diff --git a/build.sh b/build.sh index af8c10b3..81d714b5 100755 --- a/build.sh +++ b/build.sh @@ -92,9 +92,9 @@ fi if [ "$OMR_OPENWRT" = "default" ]; then if [ "$OMR_KERNEL" = "5.4" ]; then # Use OpenWrt 21.02 for 5.4 kernel - _get_repo "$OMR_TARGET/source" https://github.com/openwrt/openwrt "f441be3921c769b732f0148f005d4f1bbace0508" - _get_repo feeds/packages https://github.com/openwrt/packages "3aa30ceee4fcf7b131bdc0f98658391069573e12" - _get_repo feeds/luci https://github.com/openwrt/luci "f28aaa35cd5c0cbbe59d8cc6a67de88ceeac382e" + _get_repo "$OMR_TARGET/source" https://github.com/openwrt/openwrt "4a2cca78245e9291096e7c8c98627426df50ef58" + _get_repo feeds/packages https://github.com/openwrt/packages "978e2265968e36c9dc03004225198b85570f60d2" + _get_repo feeds/luci https://github.com/openwrt/luci "87bed0547156aca6e31ad6c72e2cd45cf198537f" else _get_repo "$OMR_TARGET/source" https://github.com/openwrt/openwrt "8a6b1a8d29cbd62f005ba20998ca9c8048ff49fc" _get_repo feeds/packages https://github.com/openwrt/packages "b5132de5cf4f7d0562445cf3c65f9f1a4bcb1bbf" diff --git a/root/include/kernel-version.mk b/root/include/kernel-version.mk index d60518e8..1073acf6 100755 --- a/root/include/kernel-version.mk +++ b/root/include/kernel-version.mk @@ -6,12 +6,12 @@ ifdef CONFIG_TESTING_KERNEL KERNEL_PATCHVER:=$(KERNEL_TESTING_PATCHVER) endif -LINUX_VERSION-5.4 = .167 +LINUX_VERSION-5.4 = .169 LINUX_VERSION-5.10 = .64 LINUX_VERSION-5.14 = .6 LINUX_VERSION-5.15 = .4 -LINUX_KERNEL_HASH-5.4.167 = b4e43116217ee02009aba7eab3081e64560b81ce42bc6096fcd81257f470a5a7 +LINUX_KERNEL_HASH-5.4.169 = 554382d95f71afd5f9b49292eb5d1cbe3be1a0bad22d21487c9e6d506ee01a19 LINUX_KERNEL_HASH-5.10.64 = 3eb84bd24a2de2b4749314e34597c02401c5d6831b055ed5224adb405c35e30a LINUX_KERNEL_HASH-5.14.6 = 54848c1268771ee3515e4c33e29abc3f1fa90d8144894cce6d0ebc3b158bccec LINUX_KERNEL_HASH-5.15.4 = 549d0fb75e65f6158e6f4becc648f249d386843da0e1211460bde8b1ea99cbca diff --git a/root/target/linux/generic/backport-5.15/050-v5.16-00-MIPS-uasm-Enable-muhu-opcode-for-MIPS-R6.patch b/root/target/linux/generic/backport-5.15/050-v5.16-00-MIPS-uasm-Enable-muhu-opcode-for-MIPS-R6.patch new file mode 100755 index 00000000..82feb742 --- /dev/null +++ b/root/target/linux/generic/backport-5.15/050-v5.16-00-MIPS-uasm-Enable-muhu-opcode-for-MIPS-R6.patch @@ -0,0 +1,65 @@ +From: Johan Almbladh +Date: Tue, 5 Oct 2021 18:54:02 +0200 +Subject: [PATCH] MIPS: uasm: Enable muhu opcode for MIPS R6 + +Enable the 'muhu' instruction, complementing the existing 'mulu', needed +to implement a MIPS32 BPF JIT. + +Also fix a typo in the existing definition of 'dmulu'. + +Signed-off-by: Tony Ambardar + +This patch is a dependency for my 32-bit MIPS eBPF JIT. 
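The 'muhu' (multiply-high unsigned) instruction returns the upper 32 bits of an unsigned 32x32-bit product, which the 32-bit eBPF JIT added further down uses to compose 64-bit multiplies out of register halves. A minimal C sketch of that arithmetic, for illustration only and not part of the patch:

	#include <stdint.h>

	/* 64x64 -> 64 bit multiply built from 32-bit halves. On MIPS32 R6 the
	 * low word of a_lo * b_lo comes from 'mulu' and the high word from
	 * 'muhu'; the cross terms only contribute to the high word, as in
	 * emit_mul_r64() in the JIT patch below. */
	static uint64_t mul64_sketch(uint64_t a, uint64_t b)
	{
		uint32_t a_lo = (uint32_t)a, a_hi = (uint32_t)(a >> 32);
		uint32_t b_lo = (uint32_t)b, b_hi = (uint32_t)(b >> 32);
		uint32_t lo   = a_lo * b_lo;                               /* mulu */
		uint32_t high = (uint32_t)(((uint64_t)a_lo * b_lo) >> 32); /* muhu */
		uint32_t hi   = a_hi * b_lo + a_lo * b_hi + high;          /* truncated to 64 bits */

		return ((uint64_t)hi << 32) | lo;
	}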
+ +Signed-off-by: Johan Almbladh +--- + +--- a/arch/mips/include/asm/uasm.h ++++ b/arch/mips/include/asm/uasm.h +@@ -145,6 +145,7 @@ Ip_u1(_mtlo); + Ip_u3u1u2(_mul); + Ip_u1u2(_multu); + Ip_u3u1u2(_mulu); ++Ip_u3u1u2(_muhu); + Ip_u3u1u2(_nor); + Ip_u3u1u2(_or); + Ip_u2u1u3(_ori); +--- a/arch/mips/mm/uasm-mips.c ++++ b/arch/mips/mm/uasm-mips.c +@@ -90,7 +90,7 @@ static const struct insn insn_table[insn + RS | RT | RD}, + [insn_dmtc0] = {M(cop0_op, dmtc_op, 0, 0, 0, 0), RT | RD | SET}, + [insn_dmultu] = {M(spec_op, 0, 0, 0, 0, dmultu_op), RS | RT}, +- [insn_dmulu] = {M(spec_op, 0, 0, 0, dmult_dmul_op, dmultu_op), ++ [insn_dmulu] = {M(spec_op, 0, 0, 0, dmultu_dmulu_op, dmultu_op), + RS | RT | RD}, + [insn_drotr] = {M(spec_op, 1, 0, 0, 0, dsrl_op), RT | RD | RE}, + [insn_drotr32] = {M(spec_op, 1, 0, 0, 0, dsrl32_op), RT | RD | RE}, +@@ -150,6 +150,8 @@ static const struct insn insn_table[insn + [insn_mtlo] = {M(spec_op, 0, 0, 0, 0, mtlo_op), RS}, + [insn_mulu] = {M(spec_op, 0, 0, 0, multu_mulu_op, multu_op), + RS | RT | RD}, ++ [insn_muhu] = {M(spec_op, 0, 0, 0, multu_muhu_op, multu_op), ++ RS | RT | RD}, + #ifndef CONFIG_CPU_MIPSR6 + [insn_mul] = {M(spec2_op, 0, 0, 0, 0, mul_op), RS | RT | RD}, + #else +--- a/arch/mips/mm/uasm.c ++++ b/arch/mips/mm/uasm.c +@@ -59,7 +59,7 @@ enum opcode { + insn_lddir, insn_ldpte, insn_ldx, insn_lh, insn_lhu, insn_ll, insn_lld, + insn_lui, insn_lw, insn_lwu, insn_lwx, insn_mfc0, insn_mfhc0, insn_mfhi, + insn_mflo, insn_modu, insn_movn, insn_movz, insn_mtc0, insn_mthc0, +- insn_mthi, insn_mtlo, insn_mul, insn_multu, insn_mulu, insn_nor, ++ insn_mthi, insn_mtlo, insn_mul, insn_multu, insn_mulu, insn_muhu, insn_nor, + insn_or, insn_ori, insn_pref, insn_rfe, insn_rotr, insn_sb, insn_sc, + insn_scd, insn_seleqz, insn_selnez, insn_sd, insn_sh, insn_sll, + insn_sllv, insn_slt, insn_slti, insn_sltiu, insn_sltu, insn_sra, +@@ -344,6 +344,7 @@ I_u1(_mtlo) + I_u3u1u2(_mul) + I_u1u2(_multu) + I_u3u1u2(_mulu) ++I_u3u1u2(_muhu) + I_u3u1u2(_nor) + I_u3u1u2(_or) + I_u2u1u3(_ori) diff --git a/root/target/linux/generic/backport-5.15/050-v5.16-01-mips-uasm-Add-workaround-for-Loongson-2F-nop-CPU-err.patch b/root/target/linux/generic/backport-5.15/050-v5.16-01-mips-uasm-Add-workaround-for-Loongson-2F-nop-CPU-err.patch new file mode 100755 index 00000000..3a4d573f --- /dev/null +++ b/root/target/linux/generic/backport-5.15/050-v5.16-01-mips-uasm-Add-workaround-for-Loongson-2F-nop-CPU-err.patch @@ -0,0 +1,31 @@ +From: Johan Almbladh +Date: Tue, 5 Oct 2021 18:54:03 +0200 +Subject: [PATCH] mips: uasm: Add workaround for Loongson-2F nop CPU errata + +This patch implements a workaround for the Loongson-2F nop in generated, +code, if the existing option CONFIG_CPU_NOP_WORKAROUND is set. Before, +the binutils option -mfix-loongson2f-nop was enabled, but no workaround +was done when emitting MIPS code. Now, the nop pseudo instruction is +emitted as "or ax,ax,zero" instead of the default "sll zero,zero,0". This +is consistent with the workaround implemented by binutils. 
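For reference, the workaround only changes which instruction word is emitted for a nop: the default form is the all-zero 'sll', the errata-safe form is an 'or' through the assembler-temporary register. A stand-alone C sketch of the standard R-type encoding showing the two words (illustration only, not part of the patch):

	#include <stdint.h>
	#include <stdio.h>

	/* Standard MIPS R-type layout: op=0 | rs | rt | rd | shamt | funct. */
	static uint32_t mips_rtype(uint32_t rs, uint32_t rt, uint32_t rd,
				   uint32_t shamt, uint32_t funct)
	{
		return (rs << 21) | (rt << 16) | (rd << 11) | (shamt << 6) | funct;
	}

	int main(void)
	{
		/* Default nop: sll $zero,$zero,0 -> 0x00000000 */
		printf("sll $0,$0,0  = 0x%08x\n", mips_rtype(0, 0, 0, 0, 0x00));
		/* Errata-safe nop: or $at,$at,$zero -> 0x00200825 */
		printf("or  $1,$1,$0 = 0x%08x\n", mips_rtype(1, 0, 1, 0, 0x25));
		return 0;
	}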
+ +Link: https://sourceware.org/legacy-ml/binutils/2009-11/msg00387.html + +Signed-off-by: Johan Almbladh +Reviewed-by: Jiaxun Yang +--- + +--- a/arch/mips/include/asm/uasm.h ++++ b/arch/mips/include/asm/uasm.h +@@ -249,7 +249,11 @@ static inline void uasm_l##lb(struct uas + #define uasm_i_bnezl(buf, rs, off) uasm_i_bnel(buf, rs, 0, off) + #define uasm_i_ehb(buf) uasm_i_sll(buf, 0, 0, 3) + #define uasm_i_move(buf, a, b) UASM_i_ADDU(buf, a, 0, b) ++#ifdef CONFIG_CPU_NOP_WORKAROUNDS ++#define uasm_i_nop(buf) uasm_i_or(buf, 1, 1, 0) ++#else + #define uasm_i_nop(buf) uasm_i_sll(buf, 0, 0, 0) ++#endif + #define uasm_i_ssnop(buf) uasm_i_sll(buf, 0, 0, 1) + + static inline void uasm_i_drotr_safe(u32 **p, unsigned int a1, diff --git a/root/target/linux/generic/backport-5.15/050-v5.16-02-mips-bpf-Add-eBPF-JIT-for-32-bit-MIPS.patch b/root/target/linux/generic/backport-5.15/050-v5.16-02-mips-bpf-Add-eBPF-JIT-for-32-bit-MIPS.patch new file mode 100755 index 00000000..79806599 --- /dev/null +++ b/root/target/linux/generic/backport-5.15/050-v5.16-02-mips-bpf-Add-eBPF-JIT-for-32-bit-MIPS.patch @@ -0,0 +1,3078 @@ +From: Johan Almbladh +Date: Tue, 5 Oct 2021 18:54:04 +0200 +Subject: [PATCH] mips: bpf: Add eBPF JIT for 32-bit MIPS + +This is an implementation of an eBPF JIT for 32-bit MIPS I-V and MIPS32. +The implementation supports all 32-bit and 64-bit ALU and JMP operations, +including the recently-added atomics. 64-bit div/mod and 64-bit atomics +are implemented using function calls to math64 and atomic64 functions, +respectively. All 32-bit operations are implemented natively by the JIT, +except if the CPU lacks ll/sc instructions. + +Register mapping +================ +All 64-bit eBPF registers are mapped to native 32-bit MIPS register pairs, +and does not use any stack scratch space for register swapping. This means +that all eBPF register data is kept in CPU registers all the time, and +this simplifies the register management a lot. It also reduces the JIT's +pressure on temporary registers since we do not have to move data around. + +Native register pairs are ordered according to CPU endiannes, following +the O32 calling convention for passing 64-bit arguments and return values. +The eBPF return value, arguments and callee-saved registers are mapped to +their native MIPS equivalents. + +Since the 32 highest bits in the eBPF FP (frame pointer) register are +always zero, only one general-purpose register is actually needed for the +mapping. The MIPS fp register is used for this purpose. The high bits are +mapped to MIPS register r0. This saves us one CPU register, which is much +needed for temporaries, while still allowing us to treat the R10 (FP) +register just like any other eBPF register in the JIT. + +The MIPS gp (global pointer) and at (assembler temporary) registers are +used as internal temporary registers for constant blinding. CPU registers +t6-t9 are used internally by the JIT when constructing more complex 64-bit +operations. This is precisely what is needed - two registers to store an +operand value, and two more as scratch registers when performing the +operation. + +The register mapping is shown below. 
+ + R0 - $v1, $v0 return value + R1 - $a1, $a0 argument 1, passed in registers + R2 - $a3, $a2 argument 2, passed in registers + R3 - $t1, $t0 argument 3, passed on stack + R4 - $t3, $t2 argument 4, passed on stack + R5 - $t4, $t3 argument 5, passed on stack + R6 - $s1, $s0 callee-saved + R7 - $s3, $s2 callee-saved + R8 - $s5, $s4 callee-saved + R9 - $s7, $s6 callee-saved + FP - $r0, $fp 32-bit frame pointer + AX - $gp, $at constant-blinding + $t6 - $t9 unallocated, JIT temporaries + +Jump offsets +============ +The JIT tries to map all conditional JMP operations to MIPS conditional +PC-relative branches. The MIPS branch offset field is 18 bits, in bytes, +which is equivalent to the eBPF 16-bit instruction offset. However, since +the JIT may emit more than one CPU instruction per eBPF instruction, the +field width may overflow. If that happens, the JIT converts the long +conditional jump to a short PC-relative branch with the condition +inverted, jumping over a long unconditional absolute jmp (j). + +This conversion will change the instruction offset mapping used for jumps, +and may in turn result in more branch offset overflows. The JIT therefore +dry-runs the translation until no more branches are converted and the +offsets do not change anymore. There is an upper bound on this of course, +and if the JIT hits that limit, the last two iterations are run with all +branches being converted. + +Tail call count +=============== +The current tail call count is stored in the 16-byte area of the caller's +stack frame that is reserved for the callee in the o32 ABI. The value is +initialized in the prologue, and propagated to the tail-callee by skipping +the initialization instructions when emitting the tail call. + +Signed-off-by: Johan Almbladh +--- + create mode 100644 arch/mips/net/bpf_jit_comp.c + create mode 100644 arch/mips/net/bpf_jit_comp.h + create mode 100644 arch/mips/net/bpf_jit_comp32.c + +--- a/arch/mips/net/Makefile ++++ b/arch/mips/net/Makefile +@@ -2,4 +2,9 @@ + # MIPS networking code + + obj-$(CONFIG_MIPS_CBPF_JIT) += bpf_jit.o bpf_jit_asm.o +-obj-$(CONFIG_MIPS_EBPF_JIT) += ebpf_jit.o ++ ++ifeq ($(CONFIG_32BIT),y) ++ obj-$(CONFIG_MIPS_EBPF_JIT) += bpf_jit_comp.o bpf_jit_comp32.o ++else ++ obj-$(CONFIG_MIPS_EBPF_JIT) += ebpf_jit.o ++endif +--- /dev/null ++++ b/arch/mips/net/bpf_jit_comp.c +@@ -0,0 +1,1020 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++/* ++ * Just-In-Time compiler for eBPF bytecode on MIPS. ++ * Implementation of JIT functions common to 32-bit and 64-bit CPUs. ++ * ++ * Copyright (c) 2021 Anyfi Networks AB. ++ * Author: Johan Almbladh ++ * ++ * Based on code and ideas from ++ * Copyright (c) 2017 Cavium, Inc. ++ * Copyright (c) 2017 Shubham Bansal ++ * Copyright (c) 2011 Mircea Gherzan ++ */ ++ ++/* ++ * Code overview ++ * ============= ++ * ++ * - bpf_jit_comp.h ++ * Common definitions and utilities. ++ * ++ * - bpf_jit_comp.c ++ * Implementation of JIT top-level logic and exported JIT API functions. ++ * Implementation of internal operations shared by 32-bit and 64-bit code. ++ * JMP and ALU JIT control code, register control code, shared ALU and ++ * JMP/JMP32 JIT operations. ++ * ++ * - bpf_jit_comp32.c ++ * Implementation of functions to JIT prologue, epilogue and a single eBPF ++ * instruction for 32-bit MIPS CPUs. The functions use shared operations ++ * where possible, and implement the rest for 32-bit MIPS such as ALU64 ++ * operations. ++ * ++ * - bpf_jit_comp64.c ++ * Ditto, for 64-bit MIPS CPUs. 
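The long-jump conversion described in the commit message above comes down to a range check: a MIPS conditional branch carries an 18-bit signed byte displacement, and any computed offset outside that window forces the inverted-branch-plus-absolute-jump sequence. A hedged C sketch of that decision (illustration only; the real check lives in setup_jmp() further down):

	#include <stdbool.h>

	/* Offsets outside [-0x20000, 0x1ffff] cannot be encoded in a branch,
	 * so the JIT inverts the condition and skips over an absolute "j". */
	static bool branch_offset_fits(int offset)
	{
		return offset >= -0x20000 && offset <= 0x1ffff;
	}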
++ * ++ * Zero and sign extension ++ * ======================== ++ * 32-bit MIPS instructions on 64-bit MIPS registers use sign extension, ++ * but the eBPF instruction set mandates zero extension. We let the verifier ++ * insert explicit zero-extensions after 32-bit ALU operations, both for ++ * 32-bit and 64-bit MIPS JITs. Conditional JMP32 operations on 64-bit MIPs ++ * are JITed with sign extensions inserted when so expected. ++ * ++ * ALU operations ++ * ============== ++ * ALU operations on 32/64-bit MIPS and ALU64 operations on 64-bit MIPS are ++ * JITed in the following steps. ALU64 operations on 32-bit MIPS are more ++ * complicated and therefore only processed by special implementations in ++ * step (3). ++ * ++ * 1) valid_alu_i: ++ * Determine if an immediate operation can be emitted as such, or if ++ * we must fall back to the register version. ++ * ++ * 2) rewrite_alu_i: ++ * Convert BPF operation and immediate value to a canonical form for ++ * JITing. In some degenerate cases this form may be a no-op. ++ * ++ * 3) emit_alu_{i,i64,r,64}: ++ * Emit instructions for an ALU or ALU64 immediate or register operation. ++ * ++ * JMP operations ++ * ============== ++ * JMP and JMP32 operations require an JIT instruction offset table for ++ * translating the jump offset. This table is computed by dry-running the ++ * JIT without actually emitting anything. However, the computed PC-relative ++ * offset may overflow the 18-bit offset field width of the native MIPS ++ * branch instruction. In such cases, the long jump is converted into the ++ * following sequence. ++ * ++ * ! +2 Inverted PC-relative branch ++ * nop Delay slot ++ * j Unconditional absolute long jump ++ * nop Delay slot ++ * ++ * Since this converted sequence alters the offset table, all offsets must ++ * be re-calculated. This may in turn trigger new branch conversions, so ++ * the process is repeated until no further changes are made. Normally it ++ * completes in 1-2 iterations. If JIT_MAX_ITERATIONS should reached, we ++ * fall back to converting every remaining jump operation. The branch ++ * conversion is independent of how the JMP or JMP32 condition is JITed. ++ * ++ * JMP32 and JMP operations are JITed as follows. ++ * ++ * 1) setup_jmp_{i,r}: ++ * Convert jump conditional and offset into a form that can be JITed. ++ * This form may be a no-op, a canonical form, or an inverted PC-relative ++ * jump if branch conversion is necessary. ++ * ++ * 2) valid_jmp_i: ++ * Determine if an immediate operations can be emitted as such, or if ++ * we must fall back to the register version. Applies to JMP32 for 32-bit ++ * MIPS, and both JMP and JMP32 for 64-bit MIPS. ++ * ++ * 3) emit_jmp_{i,i64,r,r64}: ++ * Emit instructions for an JMP or JMP32 immediate or register operation. ++ * ++ * 4) finish_jmp_{i,r}: ++ * Emit any instructions needed to finish the jump. This includes a nop ++ * for the delay slot if a branch was emitted, and a long absolute jump ++ * if the branch was converted. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "bpf_jit_comp.h" ++ ++/* Convenience macros for descriptor access */ ++#define CONVERTED(desc) ((desc) & JIT_DESC_CONVERT) ++#define INDEX(desc) ((desc) & ~JIT_DESC_CONVERT) ++ ++/* ++ * Push registers on the stack, starting at a given depth from the stack ++ * pointer and increasing. The next depth to be written is returned. 
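Step (2) of the ALU handling described in the overview comment above, rewrite_alu_i(), is essentially strength reduction on the immediate operand before anything is emitted. A small C sketch of the kind of transformations it applies (illustration only, not part of the patch):

	#include <stdint.h>

	/* Canonical forms produced by rewrite_alu_i(): multiply/divide/modulo
	 * by a power of two become a shift or a mask, and a zero immediate in
	 * add/sub/or/xor/shift is dropped as a no-op. */
	static uint32_t mul_pow2_sketch(uint32_t dst, unsigned int n)
	{
		return dst << n;		/* dst * (1 << n)  ->  dst << n */
	}

	static uint32_t mod_pow2_sketch(uint32_t dst, uint32_t pow2_imm)
	{
		return dst & (pow2_imm - 1);	/* dst % (1 << n)  ->  dst & ((1 << n) - 1) */
	}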
++ */ ++int push_regs(struct jit_context *ctx, u32 mask, u32 excl, int depth) ++{ ++ int reg; ++ ++ for (reg = 0; reg < BITS_PER_BYTE * sizeof(mask); reg++) ++ if (mask & BIT(reg)) { ++ if ((excl & BIT(reg)) == 0) { ++ if (sizeof(long) == 4) ++ emit(ctx, sw, reg, depth, MIPS_R_SP); ++ else /* sizeof(long) == 8 */ ++ emit(ctx, sd, reg, depth, MIPS_R_SP); ++ } ++ depth += sizeof(long); ++ } ++ ++ ctx->stack_used = max((int)ctx->stack_used, depth); ++ return depth; ++} ++ ++/* ++ * Pop registers from the stack, starting at a given depth from the stack ++ * pointer and increasing. The next depth to be read is returned. ++ */ ++int pop_regs(struct jit_context *ctx, u32 mask, u32 excl, int depth) ++{ ++ int reg; ++ ++ for (reg = 0; reg < BITS_PER_BYTE * sizeof(mask); reg++) ++ if (mask & BIT(reg)) { ++ if ((excl & BIT(reg)) == 0) { ++ if (sizeof(long) == 4) ++ emit(ctx, lw, reg, depth, MIPS_R_SP); ++ else /* sizeof(long) == 8 */ ++ emit(ctx, ld, reg, depth, MIPS_R_SP); ++ } ++ depth += sizeof(long); ++ } ++ ++ return depth; ++} ++ ++/* Compute the 28-bit jump target address from a BPF program location */ ++int get_target(struct jit_context *ctx, u32 loc) ++{ ++ u32 index = INDEX(ctx->descriptors[loc]); ++ unsigned long pc = (unsigned long)&ctx->target[ctx->jit_index]; ++ unsigned long addr = (unsigned long)&ctx->target[index]; ++ ++ if (!ctx->target) ++ return 0; ++ ++ if ((addr ^ pc) & ~MIPS_JMP_MASK) ++ return -1; ++ ++ return addr & MIPS_JMP_MASK; ++} ++ ++/* Compute the PC-relative offset to relative BPF program offset */ ++int get_offset(const struct jit_context *ctx, int off) ++{ ++ return (INDEX(ctx->descriptors[ctx->bpf_index + off]) - ++ ctx->jit_index - 1) * sizeof(u32); ++} ++ ++/* dst = imm (register width) */ ++void emit_mov_i(struct jit_context *ctx, u8 dst, s32 imm) ++{ ++ if (imm >= -0x8000 && imm <= 0x7fff) { ++ emit(ctx, addiu, dst, MIPS_R_ZERO, imm); ++ } else { ++ emit(ctx, lui, dst, (s16)((u32)imm >> 16)); ++ emit(ctx, ori, dst, dst, (u16)(imm & 0xffff)); ++ } ++ clobber_reg(ctx, dst); ++} ++ ++/* dst = src (register width) */ ++void emit_mov_r(struct jit_context *ctx, u8 dst, u8 src) ++{ ++ emit(ctx, ori, dst, src, 0); ++ clobber_reg(ctx, dst); ++} ++ ++/* Validate ALU immediate range */ ++bool valid_alu_i(u8 op, s32 imm) ++{ ++ switch (BPF_OP(op)) { ++ case BPF_NEG: ++ case BPF_LSH: ++ case BPF_RSH: ++ case BPF_ARSH: ++ /* All legal eBPF values are valid */ ++ return true; ++ case BPF_ADD: ++ /* imm must be 16 bits */ ++ return imm >= -0x8000 && imm <= 0x7fff; ++ case BPF_SUB: ++ /* -imm must be 16 bits */ ++ return imm >= -0x7fff && imm <= 0x8000; ++ case BPF_AND: ++ case BPF_OR: ++ case BPF_XOR: ++ /* imm must be 16 bits unsigned */ ++ return imm >= 0 && imm <= 0xffff; ++ case BPF_MUL: ++ /* imm must be zero or a positive power of two */ ++ return imm == 0 || (imm > 0 && is_power_of_2(imm)); ++ case BPF_DIV: ++ case BPF_MOD: ++ /* imm must be an 17-bit power of two */ ++ return (u32)imm <= 0x10000 && is_power_of_2((u32)imm); ++ } ++ return false; ++} ++ ++/* Rewrite ALU immediate operation */ ++bool rewrite_alu_i(u8 op, s32 imm, u8 *alu, s32 *val) ++{ ++ bool act = true; ++ ++ switch (BPF_OP(op)) { ++ case BPF_LSH: ++ case BPF_RSH: ++ case BPF_ARSH: ++ case BPF_ADD: ++ case BPF_SUB: ++ case BPF_OR: ++ case BPF_XOR: ++ /* imm == 0 is a no-op */ ++ act = imm != 0; ++ break; ++ case BPF_MUL: ++ if (imm == 1) { ++ /* dst * 1 is a no-op */ ++ act = false; ++ } else if (imm == 0) { ++ /* dst * 0 is dst & 0 */ ++ op = BPF_AND; ++ } else { ++ /* dst * (1 << n) is dst << n */ ++ 
op = BPF_LSH; ++ imm = ilog2(abs(imm)); ++ } ++ break; ++ case BPF_DIV: ++ if (imm == 1) { ++ /* dst / 1 is a no-op */ ++ act = false; ++ } else { ++ /* dst / (1 << n) is dst >> n */ ++ op = BPF_RSH; ++ imm = ilog2(imm); ++ } ++ break; ++ case BPF_MOD: ++ /* dst % (1 << n) is dst & ((1 << n) - 1) */ ++ op = BPF_AND; ++ imm--; ++ break; ++ } ++ ++ *alu = op; ++ *val = imm; ++ return act; ++} ++ ++/* ALU immediate operation (32-bit) */ ++void emit_alu_i(struct jit_context *ctx, u8 dst, s32 imm, u8 op) ++{ ++ switch (BPF_OP(op)) { ++ /* dst = -dst */ ++ case BPF_NEG: ++ emit(ctx, subu, dst, MIPS_R_ZERO, dst); ++ break; ++ /* dst = dst & imm */ ++ case BPF_AND: ++ emit(ctx, andi, dst, dst, (u16)imm); ++ break; ++ /* dst = dst | imm */ ++ case BPF_OR: ++ emit(ctx, ori, dst, dst, (u16)imm); ++ break; ++ /* dst = dst ^ imm */ ++ case BPF_XOR: ++ emit(ctx, xori, dst, dst, (u16)imm); ++ break; ++ /* dst = dst << imm */ ++ case BPF_LSH: ++ emit(ctx, sll, dst, dst, imm); ++ break; ++ /* dst = dst >> imm */ ++ case BPF_RSH: ++ emit(ctx, srl, dst, dst, imm); ++ break; ++ /* dst = dst >> imm (arithmetic) */ ++ case BPF_ARSH: ++ emit(ctx, sra, dst, dst, imm); ++ break; ++ /* dst = dst + imm */ ++ case BPF_ADD: ++ emit(ctx, addiu, dst, dst, imm); ++ break; ++ /* dst = dst - imm */ ++ case BPF_SUB: ++ emit(ctx, addiu, dst, dst, -imm); ++ break; ++ } ++ clobber_reg(ctx, dst); ++} ++ ++/* ALU register operation (32-bit) */ ++void emit_alu_r(struct jit_context *ctx, u8 dst, u8 src, u8 op) ++{ ++ switch (BPF_OP(op)) { ++ /* dst = dst & src */ ++ case BPF_AND: ++ emit(ctx, and, dst, dst, src); ++ break; ++ /* dst = dst | src */ ++ case BPF_OR: ++ emit(ctx, or, dst, dst, src); ++ break; ++ /* dst = dst ^ src */ ++ case BPF_XOR: ++ emit(ctx, xor, dst, dst, src); ++ break; ++ /* dst = dst << src */ ++ case BPF_LSH: ++ emit(ctx, sllv, dst, dst, src); ++ break; ++ /* dst = dst >> src */ ++ case BPF_RSH: ++ emit(ctx, srlv, dst, dst, src); ++ break; ++ /* dst = dst >> src (arithmetic) */ ++ case BPF_ARSH: ++ emit(ctx, srav, dst, dst, src); ++ break; ++ /* dst = dst + src */ ++ case BPF_ADD: ++ emit(ctx, addu, dst, dst, src); ++ break; ++ /* dst = dst - src */ ++ case BPF_SUB: ++ emit(ctx, subu, dst, dst, src); ++ break; ++ /* dst = dst * src */ ++ case BPF_MUL: ++ if (cpu_has_mips32r1 || cpu_has_mips32r6) { ++ emit(ctx, mul, dst, dst, src); ++ } else { ++ emit(ctx, multu, dst, src); ++ emit(ctx, mflo, dst); ++ } ++ break; ++ /* dst = dst / src */ ++ case BPF_DIV: ++ if (cpu_has_mips32r6) { ++ emit(ctx, divu_r6, dst, dst, src); ++ } else { ++ emit(ctx, divu, dst, src); ++ emit(ctx, mflo, dst); ++ } ++ break; ++ /* dst = dst % src */ ++ case BPF_MOD: ++ if (cpu_has_mips32r6) { ++ emit(ctx, modu, dst, dst, src); ++ } else { ++ emit(ctx, divu, dst, src); ++ emit(ctx, mfhi, dst); ++ } ++ break; ++ } ++ clobber_reg(ctx, dst); ++} ++ ++/* Atomic read-modify-write (32-bit) */ ++void emit_atomic_r(struct jit_context *ctx, u8 dst, u8 src, s16 off, u8 code) ++{ ++ emit(ctx, ll, MIPS_R_T9, off, dst); ++ switch (code) { ++ case BPF_ADD: ++ emit(ctx, addu, MIPS_R_T8, MIPS_R_T9, src); ++ break; ++ case BPF_AND: ++ emit(ctx, and, MIPS_R_T8, MIPS_R_T9, src); ++ break; ++ case BPF_OR: ++ emit(ctx, or, MIPS_R_T8, MIPS_R_T9, src); ++ break; ++ case BPF_XOR: ++ emit(ctx, xor, MIPS_R_T8, MIPS_R_T9, src); ++ break; ++ } ++ emit(ctx, sc, MIPS_R_T8, off, dst); ++ emit(ctx, beqz, MIPS_R_T8, -16); ++ emit(ctx, nop); /* Delay slot */ ++} ++ ++/* Atomic compare-and-exchange (32-bit) */ ++void emit_cmpxchg_r(struct jit_context *ctx, u8 dst, u8 
src, u8 res, s16 off) ++{ ++ emit(ctx, ll, MIPS_R_T9, off, dst); ++ emit(ctx, bne, MIPS_R_T9, res, 12); ++ emit(ctx, move, MIPS_R_T8, src); /* Delay slot */ ++ emit(ctx, sc, MIPS_R_T8, off, dst); ++ emit(ctx, beqz, MIPS_R_T8, -20); ++ emit(ctx, move, res, MIPS_R_T9); /* Delay slot */ ++ clobber_reg(ctx, res); ++} ++ ++/* Swap bytes and truncate a register word or half word */ ++void emit_bswap_r(struct jit_context *ctx, u8 dst, u32 width) ++{ ++ u8 tmp = MIPS_R_T8; ++ u8 msk = MIPS_R_T9; ++ ++ switch (width) { ++ /* Swap bytes in a word */ ++ case 32: ++ if (cpu_has_mips32r2 || cpu_has_mips32r6) { ++ emit(ctx, wsbh, dst, dst); ++ emit(ctx, rotr, dst, dst, 16); ++ } else { ++ emit(ctx, sll, tmp, dst, 16); /* tmp = dst << 16 */ ++ emit(ctx, srl, dst, dst, 16); /* dst = dst >> 16 */ ++ emit(ctx, or, dst, dst, tmp); /* dst = dst | tmp */ ++ ++ emit(ctx, lui, msk, 0xff); /* msk = 0x00ff0000 */ ++ emit(ctx, ori, msk, msk, 0xff); /* msk = msk | 0xff */ ++ ++ emit(ctx, and, tmp, dst, msk); /* tmp = dst & msk */ ++ emit(ctx, sll, tmp, tmp, 8); /* tmp = tmp << 8 */ ++ emit(ctx, srl, dst, dst, 8); /* dst = dst >> 8 */ ++ emit(ctx, and, dst, dst, msk); /* dst = dst & msk */ ++ emit(ctx, or, dst, dst, tmp); /* reg = dst | tmp */ ++ } ++ break; ++ /* Swap bytes in a half word */ ++ case 16: ++ if (cpu_has_mips32r2 || cpu_has_mips32r6) { ++ emit(ctx, wsbh, dst, dst); ++ emit(ctx, andi, dst, dst, 0xffff); ++ } else { ++ emit(ctx, andi, tmp, dst, 0xff00); /* t = d & 0xff00 */ ++ emit(ctx, srl, tmp, tmp, 8); /* t = t >> 8 */ ++ emit(ctx, andi, dst, dst, 0x00ff); /* d = d & 0x00ff */ ++ emit(ctx, sll, dst, dst, 8); /* d = d << 8 */ ++ emit(ctx, or, dst, dst, tmp); /* d = d | t */ ++ } ++ break; ++ } ++ clobber_reg(ctx, dst); ++} ++ ++/* Validate jump immediate range */ ++bool valid_jmp_i(u8 op, s32 imm) ++{ ++ switch (op) { ++ case JIT_JNOP: ++ /* Immediate value not used */ ++ return true; ++ case BPF_JEQ: ++ case BPF_JNE: ++ /* No immediate operation */ ++ return false; ++ case BPF_JSET: ++ case JIT_JNSET: ++ /* imm must be 16 bits unsigned */ ++ return imm >= 0 && imm <= 0xffff; ++ case BPF_JGE: ++ case BPF_JLT: ++ case BPF_JSGE: ++ case BPF_JSLT: ++ /* imm must be 16 bits */ ++ return imm >= -0x8000 && imm <= 0x7fff; ++ case BPF_JGT: ++ case BPF_JLE: ++ case BPF_JSGT: ++ case BPF_JSLE: ++ /* imm + 1 must be 16 bits */ ++ return imm >= -0x8001 && imm <= 0x7ffe; ++ } ++ return false; ++} ++ ++/* Invert a conditional jump operation */ ++static u8 invert_jmp(u8 op) ++{ ++ switch (op) { ++ case BPF_JA: return JIT_JNOP; ++ case BPF_JEQ: return BPF_JNE; ++ case BPF_JNE: return BPF_JEQ; ++ case BPF_JSET: return JIT_JNSET; ++ case BPF_JGT: return BPF_JLE; ++ case BPF_JGE: return BPF_JLT; ++ case BPF_JLT: return BPF_JGE; ++ case BPF_JLE: return BPF_JGT; ++ case BPF_JSGT: return BPF_JSLE; ++ case BPF_JSGE: return BPF_JSLT; ++ case BPF_JSLT: return BPF_JSGE; ++ case BPF_JSLE: return BPF_JSGT; ++ } ++ return 0; ++} ++ ++/* Prepare a PC-relative jump operation */ ++static void setup_jmp(struct jit_context *ctx, u8 bpf_op, ++ s16 bpf_off, u8 *jit_op, s32 *jit_off) ++{ ++ u32 *descp = &ctx->descriptors[ctx->bpf_index]; ++ int op = bpf_op; ++ int offset = 0; ++ ++ /* Do not compute offsets on the first pass */ ++ if (INDEX(*descp) == 0) ++ goto done; ++ ++ /* Skip jumps never taken */ ++ if (bpf_op == JIT_JNOP) ++ goto done; ++ ++ /* Convert jumps always taken */ ++ if (bpf_op == BPF_JA) ++ *descp |= JIT_DESC_CONVERT; ++ ++ /* ++ * Current ctx->jit_index points to the start of the branch preamble. 
++ * Since the preamble differs among different branch conditionals, ++ * the current index cannot be used to compute the branch offset. ++ * Instead, we use the offset table value for the next instruction, ++ * which gives the index immediately after the branch delay slot. ++ */ ++ if (!CONVERTED(*descp)) { ++ int target = ctx->bpf_index + bpf_off + 1; ++ int origin = ctx->bpf_index + 1; ++ ++ offset = (INDEX(ctx->descriptors[target]) - ++ INDEX(ctx->descriptors[origin]) + 1) * sizeof(u32); ++ } ++ ++ /* ++ * The PC-relative branch offset field on MIPS is 18 bits signed, ++ * so if the computed offset is larger than this we generate a an ++ * absolute jump that we skip with an inverted conditional branch. ++ */ ++ if (CONVERTED(*descp) || offset < -0x20000 || offset > 0x1ffff) { ++ offset = 3 * sizeof(u32); ++ op = invert_jmp(bpf_op); ++ ctx->changes += !CONVERTED(*descp); ++ *descp |= JIT_DESC_CONVERT; ++ } ++ ++done: ++ *jit_off = offset; ++ *jit_op = op; ++} ++ ++/* Prepare a PC-relative jump operation with immediate conditional */ ++void setup_jmp_i(struct jit_context *ctx, s32 imm, u8 width, ++ u8 bpf_op, s16 bpf_off, u8 *jit_op, s32 *jit_off) ++{ ++ bool always = false; ++ bool never = false; ++ ++ switch (bpf_op) { ++ case BPF_JEQ: ++ case BPF_JNE: ++ break; ++ case BPF_JSET: ++ case BPF_JLT: ++ never = imm == 0; ++ break; ++ case BPF_JGE: ++ always = imm == 0; ++ break; ++ case BPF_JGT: ++ never = (u32)imm == U32_MAX; ++ break; ++ case BPF_JLE: ++ always = (u32)imm == U32_MAX; ++ break; ++ case BPF_JSGT: ++ never = imm == S32_MAX && width == 32; ++ break; ++ case BPF_JSGE: ++ always = imm == S32_MIN && width == 32; ++ break; ++ case BPF_JSLT: ++ never = imm == S32_MIN && width == 32; ++ break; ++ case BPF_JSLE: ++ always = imm == S32_MAX && width == 32; ++ break; ++ } ++ ++ if (never) ++ bpf_op = JIT_JNOP; ++ if (always) ++ bpf_op = BPF_JA; ++ setup_jmp(ctx, bpf_op, bpf_off, jit_op, jit_off); ++} ++ ++/* Prepare a PC-relative jump operation with register conditional */ ++void setup_jmp_r(struct jit_context *ctx, bool same_reg, ++ u8 bpf_op, s16 bpf_off, u8 *jit_op, s32 *jit_off) ++{ ++ switch (bpf_op) { ++ case BPF_JSET: ++ break; ++ case BPF_JEQ: ++ case BPF_JGE: ++ case BPF_JLE: ++ case BPF_JSGE: ++ case BPF_JSLE: ++ if (same_reg) ++ bpf_op = BPF_JA; ++ break; ++ case BPF_JNE: ++ case BPF_JLT: ++ case BPF_JGT: ++ case BPF_JSGT: ++ case BPF_JSLT: ++ if (same_reg) ++ bpf_op = JIT_JNOP; ++ break; ++ } ++ setup_jmp(ctx, bpf_op, bpf_off, jit_op, jit_off); ++} ++ ++/* Finish a PC-relative jump operation */ ++int finish_jmp(struct jit_context *ctx, u8 jit_op, s16 bpf_off) ++{ ++ /* Emit conditional branch delay slot */ ++ if (jit_op != JIT_JNOP) ++ emit(ctx, nop); ++ /* ++ * Emit an absolute long jump with delay slot, ++ * if the PC-relative branch was converted. 
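setup_jmp_i() above also folds conditions whose outcome is already decided by the immediate, so no comparison is emitted at all. Two representative cases as a hedged C sketch (illustration only, not part of the patch):

	#include <stdint.h>

	enum fold { FOLD_NONE, FOLD_ALWAYS, FOLD_NEVER };

	/* BPF_JGE dst, 0 always holds for unsigned dst, so it becomes BPF_JA;
	 * BPF_JGT dst, U32_MAX can never hold, so it becomes JIT_JNOP. */
	static enum fold fold_jge_imm(int32_t imm)
	{
		return imm == 0 ? FOLD_ALWAYS : FOLD_NONE;
	}

	static enum fold fold_jgt_imm(int32_t imm)
	{
		return (uint32_t)imm == UINT32_MAX ? FOLD_NEVER : FOLD_NONE;
	}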
++ */ ++ if (CONVERTED(ctx->descriptors[ctx->bpf_index])) { ++ int target = get_target(ctx, ctx->bpf_index + bpf_off + 1); ++ ++ if (target < 0) ++ return -1; ++ emit(ctx, j, target); ++ emit(ctx, nop); ++ } ++ return 0; ++} ++ ++/* Jump immediate (32-bit) */ ++void emit_jmp_i(struct jit_context *ctx, u8 dst, s32 imm, s32 off, u8 op) ++{ ++ switch (op) { ++ /* No-op, used internally for branch optimization */ ++ case JIT_JNOP: ++ break; ++ /* PC += off if dst & imm */ ++ case BPF_JSET: ++ emit(ctx, andi, MIPS_R_T9, dst, (u16)imm); ++ emit(ctx, bnez, MIPS_R_T9, off); ++ break; ++ /* PC += off if (dst & imm) == 0 (not in BPF, used for long jumps) */ ++ case JIT_JNSET: ++ emit(ctx, andi, MIPS_R_T9, dst, (u16)imm); ++ emit(ctx, beqz, MIPS_R_T9, off); ++ break; ++ /* PC += off if dst > imm */ ++ case BPF_JGT: ++ emit(ctx, sltiu, MIPS_R_T9, dst, imm + 1); ++ emit(ctx, beqz, MIPS_R_T9, off); ++ break; ++ /* PC += off if dst >= imm */ ++ case BPF_JGE: ++ emit(ctx, sltiu, MIPS_R_T9, dst, imm); ++ emit(ctx, beqz, MIPS_R_T9, off); ++ break; ++ /* PC += off if dst < imm */ ++ case BPF_JLT: ++ emit(ctx, sltiu, MIPS_R_T9, dst, imm); ++ emit(ctx, bnez, MIPS_R_T9, off); ++ break; ++ /* PC += off if dst <= imm */ ++ case BPF_JLE: ++ emit(ctx, sltiu, MIPS_R_T9, dst, imm + 1); ++ emit(ctx, bnez, MIPS_R_T9, off); ++ break; ++ /* PC += off if dst > imm (signed) */ ++ case BPF_JSGT: ++ emit(ctx, slti, MIPS_R_T9, dst, imm + 1); ++ emit(ctx, beqz, MIPS_R_T9, off); ++ break; ++ /* PC += off if dst >= imm (signed) */ ++ case BPF_JSGE: ++ emit(ctx, slti, MIPS_R_T9, dst, imm); ++ emit(ctx, beqz, MIPS_R_T9, off); ++ break; ++ /* PC += off if dst < imm (signed) */ ++ case BPF_JSLT: ++ emit(ctx, slti, MIPS_R_T9, dst, imm); ++ emit(ctx, bnez, MIPS_R_T9, off); ++ break; ++ /* PC += off if dst <= imm (signed) */ ++ case BPF_JSLE: ++ emit(ctx, slti, MIPS_R_T9, dst, imm + 1); ++ emit(ctx, bnez, MIPS_R_T9, off); ++ break; ++ } ++} ++ ++/* Jump register (32-bit) */ ++void emit_jmp_r(struct jit_context *ctx, u8 dst, u8 src, s32 off, u8 op) ++{ ++ switch (op) { ++ /* No-op, used internally for branch optimization */ ++ case JIT_JNOP: ++ break; ++ /* PC += off if dst == src */ ++ case BPF_JEQ: ++ emit(ctx, beq, dst, src, off); ++ break; ++ /* PC += off if dst != src */ ++ case BPF_JNE: ++ emit(ctx, bne, dst, src, off); ++ break; ++ /* PC += off if dst & src */ ++ case BPF_JSET: ++ emit(ctx, and, MIPS_R_T9, dst, src); ++ emit(ctx, bnez, MIPS_R_T9, off); ++ break; ++ /* PC += off if (dst & imm) == 0 (not in BPF, used for long jumps) */ ++ case JIT_JNSET: ++ emit(ctx, and, MIPS_R_T9, dst, src); ++ emit(ctx, beqz, MIPS_R_T9, off); ++ break; ++ /* PC += off if dst > src */ ++ case BPF_JGT: ++ emit(ctx, sltu, MIPS_R_T9, src, dst); ++ emit(ctx, bnez, MIPS_R_T9, off); ++ break; ++ /* PC += off if dst >= src */ ++ case BPF_JGE: ++ emit(ctx, sltu, MIPS_R_T9, dst, src); ++ emit(ctx, beqz, MIPS_R_T9, off); ++ break; ++ /* PC += off if dst < src */ ++ case BPF_JLT: ++ emit(ctx, sltu, MIPS_R_T9, dst, src); ++ emit(ctx, bnez, MIPS_R_T9, off); ++ break; ++ /* PC += off if dst <= src */ ++ case BPF_JLE: ++ emit(ctx, sltu, MIPS_R_T9, src, dst); ++ emit(ctx, beqz, MIPS_R_T9, off); ++ break; ++ /* PC += off if dst > src (signed) */ ++ case BPF_JSGT: ++ emit(ctx, slt, MIPS_R_T9, src, dst); ++ emit(ctx, bnez, MIPS_R_T9, off); ++ break; ++ /* PC += off if dst >= src (signed) */ ++ case BPF_JSGE: ++ emit(ctx, slt, MIPS_R_T9, dst, src); ++ emit(ctx, beqz, MIPS_R_T9, off); ++ break; ++ /* PC += off if dst < src (signed) */ ++ case BPF_JSLT: ++ emit(ctx, 
slt, MIPS_R_T9, dst, src); ++ emit(ctx, bnez, MIPS_R_T9, off); ++ break; ++ /* PC += off if dst <= src (signed) */ ++ case BPF_JSLE: ++ emit(ctx, slt, MIPS_R_T9, src, dst); ++ emit(ctx, beqz, MIPS_R_T9, off); ++ break; ++ } ++} ++ ++/* Jump always */ ++int emit_ja(struct jit_context *ctx, s16 off) ++{ ++ int target = get_target(ctx, ctx->bpf_index + off + 1); ++ ++ if (target < 0) ++ return -1; ++ emit(ctx, j, target); ++ emit(ctx, nop); ++ return 0; ++} ++ ++/* Jump to epilogue */ ++int emit_exit(struct jit_context *ctx) ++{ ++ int target = get_target(ctx, ctx->program->len); ++ ++ if (target < 0) ++ return -1; ++ emit(ctx, j, target); ++ emit(ctx, nop); ++ return 0; ++} ++ ++/* Build the program body from eBPF bytecode */ ++static int build_body(struct jit_context *ctx) ++{ ++ const struct bpf_prog *prog = ctx->program; ++ unsigned int i; ++ ++ ctx->stack_used = 0; ++ for (i = 0; i < prog->len; i++) { ++ const struct bpf_insn *insn = &prog->insnsi[i]; ++ u32 *descp = &ctx->descriptors[i]; ++ int ret; ++ ++ access_reg(ctx, insn->src_reg); ++ access_reg(ctx, insn->dst_reg); ++ ++ ctx->bpf_index = i; ++ if (ctx->target == NULL) { ++ ctx->changes += INDEX(*descp) != ctx->jit_index; ++ *descp &= JIT_DESC_CONVERT; ++ *descp |= ctx->jit_index; ++ } ++ ++ ret = build_insn(insn, ctx); ++ if (ret < 0) ++ return ret; ++ ++ if (ret > 0) { ++ i++; ++ if (ctx->target == NULL) ++ descp[1] = ctx->jit_index; ++ } ++ } ++ ++ /* Store the end offset, where the epilogue begins */ ++ ctx->descriptors[prog->len] = ctx->jit_index; ++ return 0; ++} ++ ++/* Set the branch conversion flag on all instructions */ ++static void set_convert_flag(struct jit_context *ctx, bool enable) ++{ ++ const struct bpf_prog *prog = ctx->program; ++ u32 flag = enable ? JIT_DESC_CONVERT : 0; ++ unsigned int i; ++ ++ for (i = 0; i <= prog->len; i++) ++ ctx->descriptors[i] = INDEX(ctx->descriptors[i]) | flag; ++} ++ ++static void jit_fill_hole(void *area, unsigned int size) ++{ ++ u32 *p; ++ ++ /* We are guaranteed to have aligned memory. */ ++ for (p = area; size >= sizeof(u32); size -= sizeof(u32)) ++ uasm_i_break(&p, BRK_BUG); /* Increments p */ ++} ++ ++bool bpf_jit_needs_zext(void) ++{ ++ return true; ++} ++ ++struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) ++{ ++ struct bpf_prog *tmp, *orig_prog = prog; ++ struct bpf_binary_header *header = NULL; ++ struct jit_context ctx; ++ bool tmp_blinded = false; ++ unsigned int tmp_idx; ++ unsigned int image_size; ++ u8 *image_ptr; ++ int tries; ++ ++ /* ++ * If BPF JIT was not enabled then we must fall back to ++ * the interpreter. ++ */ ++ if (!prog->jit_requested) ++ return orig_prog; ++ /* ++ * If constant blinding was enabled and we failed during blinding ++ * then we must fall back to the interpreter. Otherwise, we save ++ * the new JITed code. ++ */ ++ tmp = bpf_jit_blind_constants(prog); ++ if (IS_ERR(tmp)) ++ return orig_prog; ++ if (tmp != prog) { ++ tmp_blinded = true; ++ prog = tmp; ++ } ++ ++ memset(&ctx, 0, sizeof(ctx)); ++ ctx.program = prog; ++ ++ /* ++ * Not able to allocate memory for descriptors[], then ++ * we must fall back to the interpreter ++ */ ++ ctx.descriptors = kcalloc(prog->len + 1, sizeof(*ctx.descriptors), ++ GFP_KERNEL); ++ if (ctx.descriptors == NULL) ++ goto out_err; ++ ++ /* First pass discovers used resources */ ++ if (build_body(&ctx) < 0) ++ goto out_err; ++ /* ++ * Second pass computes instruction offsets. 
++ * If any PC-relative branches are out of range, a sequence of ++ * a PC-relative branch + a jump is generated, and we have to ++ * try again from the beginning to generate the new offsets. ++ * This is done until no additional conversions are necessary. ++ * The last two iterations are done with all branches being ++ * converted, to guarantee offset table convergence within a ++ * fixed number of iterations. ++ */ ++ ctx.jit_index = 0; ++ build_prologue(&ctx); ++ tmp_idx = ctx.jit_index; ++ ++ tries = JIT_MAX_ITERATIONS; ++ do { ++ ctx.jit_index = tmp_idx; ++ ctx.changes = 0; ++ if (tries == 2) ++ set_convert_flag(&ctx, true); ++ if (build_body(&ctx) < 0) ++ goto out_err; ++ } while (ctx.changes > 0 && --tries > 0); ++ ++ if (WARN_ONCE(ctx.changes > 0, "JIT offsets failed to converge")) ++ goto out_err; ++ ++ build_epilogue(&ctx, MIPS_R_RA); ++ ++ /* Now we know the size of the structure to make */ ++ image_size = sizeof(u32) * ctx.jit_index; ++ header = bpf_jit_binary_alloc(image_size, &image_ptr, ++ sizeof(u32), jit_fill_hole); ++ /* ++ * Not able to allocate memory for the structure then ++ * we must fall back to the interpretation ++ */ ++ if (header == NULL) ++ goto out_err; ++ ++ /* Actual pass to generate final JIT code */ ++ ctx.target = (u32 *)image_ptr; ++ ctx.jit_index = 0; ++ ++ /* ++ * If building the JITed code fails somehow, ++ * we fall back to the interpretation. ++ */ ++ build_prologue(&ctx); ++ if (build_body(&ctx) < 0) ++ goto out_err; ++ build_epilogue(&ctx, MIPS_R_RA); ++ ++ /* Populate line info meta data */ ++ set_convert_flag(&ctx, false); ++ bpf_prog_fill_jited_linfo(prog, &ctx.descriptors[1]); ++ ++ /* Set as read-only exec and flush instruction cache */ ++ bpf_jit_binary_lock_ro(header); ++ flush_icache_range((unsigned long)header, ++ (unsigned long)&ctx.target[ctx.jit_index]); ++ ++ if (bpf_jit_enable > 1) ++ bpf_jit_dump(prog->len, image_size, 2, ctx.target); ++ ++ prog->bpf_func = (void *)ctx.target; ++ prog->jited = 1; ++ prog->jited_len = image_size; ++ ++out: ++ if (tmp_blinded) ++ bpf_jit_prog_release_other(prog, prog == orig_prog ? ++ tmp : orig_prog); ++ kfree(ctx.descriptors); ++ return prog; ++ ++out_err: ++ prog = orig_prog; ++ if (header) ++ bpf_jit_binary_free(header); ++ goto out; ++} +--- /dev/null ++++ b/arch/mips/net/bpf_jit_comp.h +@@ -0,0 +1,211 @@ ++/* SPDX-License-Identifier: GPL-2.0-only */ ++/* ++ * Just-In-Time compiler for eBPF bytecode on 32-bit and 64-bit MIPS. ++ * ++ * Copyright (c) 2021 Anyfi Networks AB. ++ * Author: Johan Almbladh ++ * ++ * Based on code and ideas from ++ * Copyright (c) 2017 Cavium, Inc. 
++ * Copyright (c) 2017 Shubham Bansal ++ * Copyright (c) 2011 Mircea Gherzan ++ */ ++ ++#ifndef _BPF_JIT_COMP_H ++#define _BPF_JIT_COMP_H ++ ++/* MIPS registers */ ++#define MIPS_R_ZERO 0 /* Const zero */ ++#define MIPS_R_AT 1 /* Asm temp */ ++#define MIPS_R_V0 2 /* Result */ ++#define MIPS_R_V1 3 /* Result */ ++#define MIPS_R_A0 4 /* Argument */ ++#define MIPS_R_A1 5 /* Argument */ ++#define MIPS_R_A2 6 /* Argument */ ++#define MIPS_R_A3 7 /* Argument */ ++#define MIPS_R_A4 8 /* Arg (n64) */ ++#define MIPS_R_A5 9 /* Arg (n64) */ ++#define MIPS_R_A6 10 /* Arg (n64) */ ++#define MIPS_R_A7 11 /* Arg (n64) */ ++#define MIPS_R_T0 8 /* Temp (o32) */ ++#define MIPS_R_T1 9 /* Temp (o32) */ ++#define MIPS_R_T2 10 /* Temp (o32) */ ++#define MIPS_R_T3 11 /* Temp (o32) */ ++#define MIPS_R_T4 12 /* Temporary */ ++#define MIPS_R_T5 13 /* Temporary */ ++#define MIPS_R_T6 14 /* Temporary */ ++#define MIPS_R_T7 15 /* Temporary */ ++#define MIPS_R_S0 16 /* Saved */ ++#define MIPS_R_S1 17 /* Saved */ ++#define MIPS_R_S2 18 /* Saved */ ++#define MIPS_R_S3 19 /* Saved */ ++#define MIPS_R_S4 20 /* Saved */ ++#define MIPS_R_S5 21 /* Saved */ ++#define MIPS_R_S6 22 /* Saved */ ++#define MIPS_R_S7 23 /* Saved */ ++#define MIPS_R_T8 24 /* Temporary */ ++#define MIPS_R_T9 25 /* Temporary */ ++/* MIPS_R_K0 26 Reserved */ ++/* MIPS_R_K1 27 Reserved */ ++#define MIPS_R_GP 28 /* Global ptr */ ++#define MIPS_R_SP 29 /* Stack ptr */ ++#define MIPS_R_FP 30 /* Frame ptr */ ++#define MIPS_R_RA 31 /* Return */ ++ ++/* ++ * Jump address mask for immediate jumps. The four most significant bits ++ * must be equal to PC. ++ */ ++#define MIPS_JMP_MASK 0x0fffffffUL ++ ++/* Maximum number of iterations in offset table computation */ ++#define JIT_MAX_ITERATIONS 8 ++ ++/* ++ * Jump pseudo-instructions used internally ++ * for branch conversion and branch optimization. ++ */ ++#define JIT_JNSET 0xe0 ++#define JIT_JNOP 0xf0 ++ ++/* Descriptor flag for PC-relative branch conversion */ ++#define JIT_DESC_CONVERT BIT(31) ++ ++/* JIT context for an eBPF program */ ++struct jit_context { ++ struct bpf_prog *program; /* The eBPF program being JITed */ ++ u32 *descriptors; /* eBPF to JITed CPU insn descriptors */ ++ u32 *target; /* JITed code buffer */ ++ u32 bpf_index; /* Index of current BPF program insn */ ++ u32 jit_index; /* Index of current JIT target insn */ ++ u32 changes; /* Number of PC-relative branch conv */ ++ u32 accessed; /* Bit mask of read eBPF registers */ ++ u32 clobbered; /* Bit mask of modified CPU registers */ ++ u32 stack_size; /* Total allocated stack size in bytes */ ++ u32 saved_size; /* Size of callee-saved registers */ ++ u32 stack_used; /* Stack size used for function calls */ ++}; ++ ++/* Emit the instruction if the JIT memory space has been allocated */ ++#define emit(ctx, func, ...) \ ++do { \ ++ if ((ctx)->target != NULL) { \ ++ u32 *p = &(ctx)->target[ctx->jit_index]; \ ++ uasm_i_##func(&p, ##__VA_ARGS__); \ ++ } \ ++ (ctx)->jit_index++; \ ++} while (0) ++ ++/* ++ * Mark a BPF register as accessed, it needs to be ++ * initialized by the program if expected, e.g. FP. ++ */ ++static inline void access_reg(struct jit_context *ctx, u8 reg) ++{ ++ ctx->accessed |= BIT(reg); ++} ++ ++/* ++ * Mark a CPU register as clobbered, it needs to be ++ * saved/restored by the program if callee-saved. ++ */ ++static inline void clobber_reg(struct jit_context *ctx, u8 reg) ++{ ++ ctx->clobbered |= BIT(reg); ++} ++ ++/* ++ * Push registers on the stack, starting at a given depth from the stack ++ * pointer and increasing. 
The next depth to be written is returned. ++ */ ++int push_regs(struct jit_context *ctx, u32 mask, u32 excl, int depth); ++ ++/* ++ * Pop registers from the stack, starting at a given depth from the stack ++ * pointer and increasing. The next depth to be read is returned. ++ */ ++int pop_regs(struct jit_context *ctx, u32 mask, u32 excl, int depth); ++ ++/* Compute the 28-bit jump target address from a BPF program location */ ++int get_target(struct jit_context *ctx, u32 loc); ++ ++/* Compute the PC-relative offset to relative BPF program offset */ ++int get_offset(const struct jit_context *ctx, int off); ++ ++/* dst = imm (32-bit) */ ++void emit_mov_i(struct jit_context *ctx, u8 dst, s32 imm); ++ ++/* dst = src (32-bit) */ ++void emit_mov_r(struct jit_context *ctx, u8 dst, u8 src); ++ ++/* Validate ALU/ALU64 immediate range */ ++bool valid_alu_i(u8 op, s32 imm); ++ ++/* Rewrite ALU/ALU64 immediate operation */ ++bool rewrite_alu_i(u8 op, s32 imm, u8 *alu, s32 *val); ++ ++/* ALU immediate operation (32-bit) */ ++void emit_alu_i(struct jit_context *ctx, u8 dst, s32 imm, u8 op); ++ ++/* ALU register operation (32-bit) */ ++void emit_alu_r(struct jit_context *ctx, u8 dst, u8 src, u8 op); ++ ++/* Atomic read-modify-write (32-bit) */ ++void emit_atomic_r(struct jit_context *ctx, u8 dst, u8 src, s16 off, u8 code); ++ ++/* Atomic compare-and-exchange (32-bit) */ ++void emit_cmpxchg_r(struct jit_context *ctx, u8 dst, u8 src, u8 res, s16 off); ++ ++/* Swap bytes and truncate a register word or half word */ ++void emit_bswap_r(struct jit_context *ctx, u8 dst, u32 width); ++ ++/* Validate JMP/JMP32 immediate range */ ++bool valid_jmp_i(u8 op, s32 imm); ++ ++/* Prepare a PC-relative jump operation with immediate conditional */ ++void setup_jmp_i(struct jit_context *ctx, s32 imm, u8 width, ++ u8 bpf_op, s16 bpf_off, u8 *jit_op, s32 *jit_off); ++ ++/* Prepare a PC-relative jump operation with register conditional */ ++void setup_jmp_r(struct jit_context *ctx, bool same_reg, ++ u8 bpf_op, s16 bpf_off, u8 *jit_op, s32 *jit_off); ++ ++/* Finish a PC-relative jump operation */ ++int finish_jmp(struct jit_context *ctx, u8 jit_op, s16 bpf_off); ++ ++/* Conditional JMP/JMP32 immediate */ ++void emit_jmp_i(struct jit_context *ctx, u8 dst, s32 imm, s32 off, u8 op); ++ ++/* Conditional JMP/JMP32 register */ ++void emit_jmp_r(struct jit_context *ctx, u8 dst, u8 src, s32 off, u8 op); ++ ++/* Jump always */ ++int emit_ja(struct jit_context *ctx, s16 off); ++ ++/* Jump to epilogue */ ++int emit_exit(struct jit_context *ctx); ++ ++/* ++ * Build program prologue to set up the stack and registers. ++ * This function is implemented separately for 32-bit and 64-bit JITs. ++ */ ++void build_prologue(struct jit_context *ctx); ++ ++/* ++ * Build the program epilogue to restore the stack and registers. ++ * This function is implemented separately for 32-bit and 64-bit JITs. ++ */ ++void build_epilogue(struct jit_context *ctx, int dest_reg); ++ ++/* ++ * Convert an eBPF instruction to native instruction, i.e ++ * JITs an eBPF instruction. ++ * Returns : ++ * 0 - Successfully JITed an 8-byte eBPF instruction ++ * >0 - Successfully JITed a 16-byte eBPF instruction ++ * <0 - Failed to JIT. ++ * This function is implemented separately for 32-bit and 64-bit JITs. 
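The emit() macro defined earlier in this header is what makes the multi-pass scheme cheap: with ctx->target == NULL it only advances jit_index, so the same build_insn()/build_body() code measures the program during the sizing passes and writes it on the final pass. A stand-alone C sketch of that idea (hypothetical names, not part of the patch):

	/* Minimal model of the two-pass emit strategy: counting always happens,
	 * storing only once a real buffer has been allocated. */
	struct sketch_ctx {
		unsigned int *target;		/* NULL during the sizing passes */
		unsigned int jit_index;		/* next instruction slot */
	};

	static void sketch_emit_word(struct sketch_ctx *ctx, unsigned int word)
	{
		if (ctx->target != NULL)
			ctx->target[ctx->jit_index] = word;
		ctx->jit_index++;		/* advanced in every pass */
	}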
++ */ ++int build_insn(const struct bpf_insn *insn, struct jit_context *ctx); ++ ++#endif /* _BPF_JIT_COMP_H */ +--- /dev/null ++++ b/arch/mips/net/bpf_jit_comp32.c +@@ -0,0 +1,1741 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++/* ++ * Just-In-Time compiler for eBPF bytecode on MIPS. ++ * Implementation of JIT functions for 32-bit CPUs. ++ * ++ * Copyright (c) 2021 Anyfi Networks AB. ++ * Author: Johan Almbladh ++ * ++ * Based on code and ideas from ++ * Copyright (c) 2017 Cavium, Inc. ++ * Copyright (c) 2017 Shubham Bansal ++ * Copyright (c) 2011 Mircea Gherzan ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "bpf_jit_comp.h" ++ ++/* MIPS a4-a7 are not available in the o32 ABI */ ++#undef MIPS_R_A4 ++#undef MIPS_R_A5 ++#undef MIPS_R_A6 ++#undef MIPS_R_A7 ++ ++/* Stack is 8-byte aligned in o32 ABI */ ++#define MIPS_STACK_ALIGNMENT 8 ++ ++/* ++ * The top 16 bytes of a stack frame is reserved for the callee in O32 ABI. ++ * This corresponds to stack space for register arguments a0-a3. ++ */ ++#define JIT_RESERVED_STACK 16 ++ ++/* Temporary 64-bit register used by JIT */ ++#define JIT_REG_TMP MAX_BPF_JIT_REG ++ ++/* ++ * Number of prologue bytes to skip when doing a tail call. ++ * Tail call count (TCC) initialization (8 bytes) always, plus ++ * R0-to-v0 assignment (4 bytes) if big endian. ++ */ ++#ifdef __BIG_ENDIAN ++#define JIT_TCALL_SKIP 12 ++#else ++#define JIT_TCALL_SKIP 8 ++#endif ++ ++/* CPU registers holding the callee return value */ ++#define JIT_RETURN_REGS \ ++ (BIT(MIPS_R_V0) | \ ++ BIT(MIPS_R_V1)) ++ ++/* CPU registers arguments passed to callee directly */ ++#define JIT_ARG_REGS \ ++ (BIT(MIPS_R_A0) | \ ++ BIT(MIPS_R_A1) | \ ++ BIT(MIPS_R_A2) | \ ++ BIT(MIPS_R_A3)) ++ ++/* CPU register arguments passed to callee on stack */ ++#define JIT_STACK_REGS \ ++ (BIT(MIPS_R_T0) | \ ++ BIT(MIPS_R_T1) | \ ++ BIT(MIPS_R_T2) | \ ++ BIT(MIPS_R_T3) | \ ++ BIT(MIPS_R_T4) | \ ++ BIT(MIPS_R_T5)) ++ ++/* Caller-saved CPU registers */ ++#define JIT_CALLER_REGS \ ++ (JIT_RETURN_REGS | \ ++ JIT_ARG_REGS | \ ++ JIT_STACK_REGS) ++ ++/* Callee-saved CPU registers */ ++#define JIT_CALLEE_REGS \ ++ (BIT(MIPS_R_S0) | \ ++ BIT(MIPS_R_S1) | \ ++ BIT(MIPS_R_S2) | \ ++ BIT(MIPS_R_S3) | \ ++ BIT(MIPS_R_S4) | \ ++ BIT(MIPS_R_S5) | \ ++ BIT(MIPS_R_S6) | \ ++ BIT(MIPS_R_S7) | \ ++ BIT(MIPS_R_GP) | \ ++ BIT(MIPS_R_FP) | \ ++ BIT(MIPS_R_RA)) ++ ++/* ++ * Mapping of 64-bit eBPF registers to 32-bit native MIPS registers. ++ * ++ * 1) Native register pairs are ordered according to CPU endiannes, following ++ * the MIPS convention for passing 64-bit arguments and return values. ++ * 2) The eBPF return value, arguments and callee-saved registers are mapped ++ * to their native MIPS equivalents. ++ * 3) Since the 32 highest bits in the eBPF FP register are always zero, ++ * only one general-purpose register is actually needed for the mapping. ++ * We use the fp register for this purpose, and map the highest bits to ++ * the MIPS register r0 (zero). ++ * 4) We use the MIPS gp and at registers as internal temporary registers ++ * for constant blinding. The gp register is callee-saved. ++ * 5) One 64-bit temporary register is mapped for use when sign-extending ++ * immediate operands. MIPS registers t6-t9 are available to the JIT ++ * for as temporaries when implementing complex 64-bit operations. ++ * ++ * With this scheme all eBPF registers are being mapped to native MIPS ++ * registers without having to use any stack scratch space. 
The direct ++ * register mapping (2) simplifies the handling of function calls. ++ */ ++static const u8 bpf2mips32[][2] = { ++ /* Return value from in-kernel function, and exit value from eBPF */ ++ [BPF_REG_0] = {MIPS_R_V1, MIPS_R_V0}, ++ /* Arguments from eBPF program to in-kernel function */ ++ [BPF_REG_1] = {MIPS_R_A1, MIPS_R_A0}, ++ [BPF_REG_2] = {MIPS_R_A3, MIPS_R_A2}, ++ /* Remaining arguments, to be passed on the stack per O32 ABI */ ++ [BPF_REG_3] = {MIPS_R_T1, MIPS_R_T0}, ++ [BPF_REG_4] = {MIPS_R_T3, MIPS_R_T2}, ++ [BPF_REG_5] = {MIPS_R_T5, MIPS_R_T4}, ++ /* Callee-saved registers that in-kernel function will preserve */ ++ [BPF_REG_6] = {MIPS_R_S1, MIPS_R_S0}, ++ [BPF_REG_7] = {MIPS_R_S3, MIPS_R_S2}, ++ [BPF_REG_8] = {MIPS_R_S5, MIPS_R_S4}, ++ [BPF_REG_9] = {MIPS_R_S7, MIPS_R_S6}, ++ /* Read-only frame pointer to access the eBPF stack */ ++#ifdef __BIG_ENDIAN ++ [BPF_REG_FP] = {MIPS_R_FP, MIPS_R_ZERO}, ++#else ++ [BPF_REG_FP] = {MIPS_R_ZERO, MIPS_R_FP}, ++#endif ++ /* Temporary register for blinding constants */ ++ [BPF_REG_AX] = {MIPS_R_GP, MIPS_R_AT}, ++ /* Temporary register for internal JIT use */ ++ [JIT_REG_TMP] = {MIPS_R_T7, MIPS_R_T6}, ++}; ++ ++/* Get low CPU register for a 64-bit eBPF register mapping */ ++static inline u8 lo(const u8 reg[]) ++{ ++#ifdef __BIG_ENDIAN ++ return reg[0]; ++#else ++ return reg[1]; ++#endif ++} ++ ++/* Get high CPU register for a 64-bit eBPF register mapping */ ++static inline u8 hi(const u8 reg[]) ++{ ++#ifdef __BIG_ENDIAN ++ return reg[1]; ++#else ++ return reg[0]; ++#endif ++} ++ ++/* ++ * Mark a 64-bit CPU register pair as clobbered, it needs to be ++ * saved/restored by the program if callee-saved. ++ */ ++static void clobber_reg64(struct jit_context *ctx, const u8 reg[]) ++{ ++ clobber_reg(ctx, reg[0]); ++ clobber_reg(ctx, reg[1]); ++} ++ ++/* dst = imm (sign-extended) */ ++static void emit_mov_se_i64(struct jit_context *ctx, const u8 dst[], s32 imm) ++{ ++ emit_mov_i(ctx, lo(dst), imm); ++ if (imm < 0) ++ emit(ctx, addiu, hi(dst), MIPS_R_ZERO, -1); ++ else ++ emit(ctx, move, hi(dst), MIPS_R_ZERO); ++ clobber_reg64(ctx, dst); ++} ++ ++/* Zero extension, if verifier does not do it for us */ ++static void emit_zext_ver(struct jit_context *ctx, const u8 dst[]) ++{ ++ if (!ctx->program->aux->verifier_zext) { ++ emit(ctx, move, hi(dst), MIPS_R_ZERO); ++ clobber_reg(ctx, hi(dst)); ++ } ++} ++ ++/* Load delay slot, if ISA mandates it */ ++static void emit_load_delay(struct jit_context *ctx) ++{ ++ if (!cpu_has_mips_2_3_4_5_r) ++ emit(ctx, nop); ++} ++ ++/* ALU immediate operation (64-bit) */ ++static void emit_alu_i64(struct jit_context *ctx, ++ const u8 dst[], s32 imm, u8 op) ++{ ++ u8 src = MIPS_R_T6; ++ ++ /* ++ * ADD/SUB with all but the max negative imm can be handled by ++ * inverting the operation and the imm value, saving one insn. 
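The addu/sltu/addu sequences emitted by the 64-bit ALU helpers below implement a plain add-with-carry across the register pair; the carry is recovered by an unsigned compare of the low-word sum against one of the addends. A C sketch of the same arithmetic (illustration only, not part of the patch):

	#include <stdint.h>

	/* 64-bit add on 32-bit halves, mirroring the BPF_ADD register case:
	 * addu lo,lo,src ; sltu t9,lo,src ; addu hi,hi,hi(src) ; addu hi,hi,t9 */
	static void add64_sketch(uint32_t *lo, uint32_t *hi,
				 uint32_t src_lo, uint32_t src_hi)
	{
		uint32_t carry;

		*lo += src_lo;			/* may wrap around */
		carry = *lo < src_lo;		/* 1 if the low word overflowed */
		*hi += src_hi + carry;
	}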
++ */ ++ if (imm > S32_MIN && imm < 0) ++ switch (op) { ++ case BPF_ADD: ++ op = BPF_SUB; ++ imm = -imm; ++ break; ++ case BPF_SUB: ++ op = BPF_ADD; ++ imm = -imm; ++ break; ++ } ++ ++ /* Move immediate to temporary register */ ++ emit_mov_i(ctx, src, imm); ++ ++ switch (op) { ++ /* dst = dst + imm */ ++ case BPF_ADD: ++ emit(ctx, addu, lo(dst), lo(dst), src); ++ emit(ctx, sltu, MIPS_R_T9, lo(dst), src); ++ emit(ctx, addu, hi(dst), hi(dst), MIPS_R_T9); ++ if (imm < 0) ++ emit(ctx, addiu, hi(dst), hi(dst), -1); ++ break; ++ /* dst = dst - imm */ ++ case BPF_SUB: ++ emit(ctx, sltu, MIPS_R_T9, lo(dst), src); ++ emit(ctx, subu, lo(dst), lo(dst), src); ++ emit(ctx, subu, hi(dst), hi(dst), MIPS_R_T9); ++ if (imm < 0) ++ emit(ctx, addiu, hi(dst), hi(dst), 1); ++ break; ++ /* dst = dst | imm */ ++ case BPF_OR: ++ emit(ctx, or, lo(dst), lo(dst), src); ++ if (imm < 0) ++ emit(ctx, addiu, hi(dst), MIPS_R_ZERO, -1); ++ break; ++ /* dst = dst & imm */ ++ case BPF_AND: ++ emit(ctx, and, lo(dst), lo(dst), src); ++ if (imm >= 0) ++ emit(ctx, move, hi(dst), MIPS_R_ZERO); ++ break; ++ /* dst = dst ^ imm */ ++ case BPF_XOR: ++ emit(ctx, xor, lo(dst), lo(dst), src); ++ if (imm < 0) { ++ emit(ctx, subu, hi(dst), MIPS_R_ZERO, hi(dst)); ++ emit(ctx, addiu, hi(dst), hi(dst), -1); ++ } ++ break; ++ } ++ clobber_reg64(ctx, dst); ++} ++ ++/* ALU register operation (64-bit) */ ++static void emit_alu_r64(struct jit_context *ctx, ++ const u8 dst[], const u8 src[], u8 op) ++{ ++ switch (BPF_OP(op)) { ++ /* dst = dst + src */ ++ case BPF_ADD: ++ if (src == dst) { ++ emit(ctx, srl, MIPS_R_T9, lo(dst), 31); ++ emit(ctx, addu, lo(dst), lo(dst), lo(dst)); ++ } else { ++ emit(ctx, addu, lo(dst), lo(dst), lo(src)); ++ emit(ctx, sltu, MIPS_R_T9, lo(dst), lo(src)); ++ } ++ emit(ctx, addu, hi(dst), hi(dst), hi(src)); ++ emit(ctx, addu, hi(dst), hi(dst), MIPS_R_T9); ++ break; ++ /* dst = dst - src */ ++ case BPF_SUB: ++ emit(ctx, sltu, MIPS_R_T9, lo(dst), lo(src)); ++ emit(ctx, subu, lo(dst), lo(dst), lo(src)); ++ emit(ctx, subu, hi(dst), hi(dst), hi(src)); ++ emit(ctx, subu, hi(dst), hi(dst), MIPS_R_T9); ++ break; ++ /* dst = dst | src */ ++ case BPF_OR: ++ emit(ctx, or, lo(dst), lo(dst), lo(src)); ++ emit(ctx, or, hi(dst), hi(dst), hi(src)); ++ break; ++ /* dst = dst & src */ ++ case BPF_AND: ++ emit(ctx, and, lo(dst), lo(dst), lo(src)); ++ emit(ctx, and, hi(dst), hi(dst), hi(src)); ++ break; ++ /* dst = dst ^ src */ ++ case BPF_XOR: ++ emit(ctx, xor, lo(dst), lo(dst), lo(src)); ++ emit(ctx, xor, hi(dst), hi(dst), hi(src)); ++ break; ++ } ++ clobber_reg64(ctx, dst); ++} ++ ++/* ALU invert (64-bit) */ ++static void emit_neg_i64(struct jit_context *ctx, const u8 dst[]) ++{ ++ emit(ctx, sltu, MIPS_R_T9, MIPS_R_ZERO, lo(dst)); ++ emit(ctx, subu, lo(dst), MIPS_R_ZERO, lo(dst)); ++ emit(ctx, subu, hi(dst), MIPS_R_ZERO, hi(dst)); ++ emit(ctx, subu, hi(dst), hi(dst), MIPS_R_T9); ++ ++ clobber_reg64(ctx, dst); ++} ++ ++/* ALU shift immediate (64-bit) */ ++static void emit_shift_i64(struct jit_context *ctx, ++ const u8 dst[], u32 imm, u8 op) ++{ ++ switch (BPF_OP(op)) { ++ /* dst = dst << imm */ ++ case BPF_LSH: ++ if (imm < 32) { ++ emit(ctx, srl, MIPS_R_T9, lo(dst), 32 - imm); ++ emit(ctx, sll, lo(dst), lo(dst), imm); ++ emit(ctx, sll, hi(dst), hi(dst), imm); ++ emit(ctx, or, hi(dst), hi(dst), MIPS_R_T9); ++ } else { ++ emit(ctx, sll, hi(dst), lo(dst), imm - 32); ++ emit(ctx, move, lo(dst), MIPS_R_ZERO); ++ } ++ break; ++ /* dst = dst >> imm */ ++ case BPF_RSH: ++ if (imm < 32) { ++ emit(ctx, sll, MIPS_R_T9, hi(dst), 32 - imm); ++ 
emit(ctx, srl, lo(dst), lo(dst), imm); ++ emit(ctx, srl, hi(dst), hi(dst), imm); ++ emit(ctx, or, lo(dst), lo(dst), MIPS_R_T9); ++ } else { ++ emit(ctx, srl, lo(dst), hi(dst), imm - 32); ++ emit(ctx, move, hi(dst), MIPS_R_ZERO); ++ } ++ break; ++ /* dst = dst >> imm (arithmetic) */ ++ case BPF_ARSH: ++ if (imm < 32) { ++ emit(ctx, sll, MIPS_R_T9, hi(dst), 32 - imm); ++ emit(ctx, srl, lo(dst), lo(dst), imm); ++ emit(ctx, sra, hi(dst), hi(dst), imm); ++ emit(ctx, or, lo(dst), lo(dst), MIPS_R_T9); ++ } else { ++ emit(ctx, sra, lo(dst), hi(dst), imm - 32); ++ emit(ctx, sra, hi(dst), hi(dst), 31); ++ } ++ break; ++ } ++ clobber_reg64(ctx, dst); ++} ++ ++/* ALU shift register (64-bit) */ ++static void emit_shift_r64(struct jit_context *ctx, ++ const u8 dst[], u8 src, u8 op) ++{ ++ u8 t1 = MIPS_R_T8; ++ u8 t2 = MIPS_R_T9; ++ ++ emit(ctx, andi, t1, src, 32); /* t1 = src & 32 */ ++ emit(ctx, beqz, t1, 16); /* PC += 16 if t1 == 0 */ ++ emit(ctx, nor, t2, src, MIPS_R_ZERO); /* t2 = ~src (delay slot) */ ++ ++ switch (BPF_OP(op)) { ++ /* dst = dst << src */ ++ case BPF_LSH: ++ /* Next: shift >= 32 */ ++ emit(ctx, sllv, hi(dst), lo(dst), src); /* dh = dl << src */ ++ emit(ctx, move, lo(dst), MIPS_R_ZERO); /* dl = 0 */ ++ emit(ctx, b, 20); /* PC += 20 */ ++ /* +16: shift < 32 */ ++ emit(ctx, srl, t1, lo(dst), 1); /* t1 = dl >> 1 */ ++ emit(ctx, srlv, t1, t1, t2); /* t1 = t1 >> t2 */ ++ emit(ctx, sllv, lo(dst), lo(dst), src); /* dl = dl << src */ ++ emit(ctx, sllv, hi(dst), hi(dst), src); /* dh = dh << src */ ++ emit(ctx, or, hi(dst), hi(dst), t1); /* dh = dh | t1 */ ++ break; ++ /* dst = dst >> src */ ++ case BPF_RSH: ++ /* Next: shift >= 32 */ ++ emit(ctx, srlv, lo(dst), hi(dst), src); /* dl = dh >> src */ ++ emit(ctx, move, hi(dst), MIPS_R_ZERO); /* dh = 0 */ ++ emit(ctx, b, 20); /* PC += 20 */ ++ /* +16: shift < 32 */ ++ emit(ctx, sll, t1, hi(dst), 1); /* t1 = dl << 1 */ ++ emit(ctx, sllv, t1, t1, t2); /* t1 = t1 << t2 */ ++ emit(ctx, srlv, lo(dst), lo(dst), src); /* dl = dl >> src */ ++ emit(ctx, srlv, hi(dst), hi(dst), src); /* dh = dh >> src */ ++ emit(ctx, or, lo(dst), lo(dst), t1); /* dl = dl | t1 */ ++ break; ++ /* dst = dst >> src (arithmetic) */ ++ case BPF_ARSH: ++ /* Next: shift >= 32 */ ++ emit(ctx, srav, lo(dst), hi(dst), src); /* dl = dh >>a src */ ++ emit(ctx, sra, hi(dst), hi(dst), 31); /* dh = dh >>a 31 */ ++ emit(ctx, b, 20); /* PC += 20 */ ++ /* +16: shift < 32 */ ++ emit(ctx, sll, t1, hi(dst), 1); /* t1 = dl << 1 */ ++ emit(ctx, sllv, t1, t1, t2); /* t1 = t1 << t2 */ ++ emit(ctx, srlv, lo(dst), lo(dst), src); /* dl = dl >>a src */ ++ emit(ctx, srav, hi(dst), hi(dst), src); /* dh = dh >> src */ ++ emit(ctx, or, lo(dst), lo(dst), t1); /* dl = dl | t1 */ ++ break; ++ } ++ ++ /* +20: Done */ ++ clobber_reg64(ctx, dst); ++} ++ ++/* ALU mul immediate (64x32-bit) */ ++static void emit_mul_i64(struct jit_context *ctx, const u8 dst[], s32 imm) ++{ ++ u8 src = MIPS_R_T6; ++ u8 tmp = MIPS_R_T9; ++ ++ switch (imm) { ++ /* dst = dst * 1 is a no-op */ ++ case 1: ++ break; ++ /* dst = dst * -1 */ ++ case -1: ++ emit_neg_i64(ctx, dst); ++ break; ++ case 0: ++ emit_mov_r(ctx, lo(dst), MIPS_R_ZERO); ++ emit_mov_r(ctx, hi(dst), MIPS_R_ZERO); ++ break; ++ /* Full 64x32 multiply */ ++ default: ++ /* hi(dst) = hi(dst) * src(imm) */ ++ emit_mov_i(ctx, src, imm); ++ if (cpu_has_mips32r1 || cpu_has_mips32r6) { ++ emit(ctx, mul, hi(dst), hi(dst), src); ++ } else { ++ emit(ctx, multu, hi(dst), src); ++ emit(ctx, mflo, hi(dst)); ++ } ++ ++ /* hi(dst) = hi(dst) - lo(dst) */ ++ if (imm < 0) ++ emit(ctx, subu, 
hi(dst), hi(dst), lo(dst)); ++ ++ /* tmp = lo(dst) * src(imm) >> 32 */ ++ /* lo(dst) = lo(dst) * src(imm) */ ++ if (cpu_has_mips32r6) { ++ emit(ctx, muhu, tmp, lo(dst), src); ++ emit(ctx, mulu, lo(dst), lo(dst), src); ++ } else { ++ emit(ctx, multu, lo(dst), src); ++ emit(ctx, mflo, lo(dst)); ++ emit(ctx, mfhi, tmp); ++ } ++ ++ /* hi(dst) += tmp */ ++ emit(ctx, addu, hi(dst), hi(dst), tmp); ++ clobber_reg64(ctx, dst); ++ break; ++ } ++} ++ ++/* ALU mul register (64x64-bit) */ ++static void emit_mul_r64(struct jit_context *ctx, ++ const u8 dst[], const u8 src[]) ++{ ++ u8 acc = MIPS_R_T8; ++ u8 tmp = MIPS_R_T9; ++ ++ /* acc = hi(dst) * lo(src) */ ++ if (cpu_has_mips32r1 || cpu_has_mips32r6) { ++ emit(ctx, mul, acc, hi(dst), lo(src)); ++ } else { ++ emit(ctx, multu, hi(dst), lo(src)); ++ emit(ctx, mflo, acc); ++ } ++ ++ /* tmp = lo(dst) * hi(src) */ ++ if (cpu_has_mips32r1 || cpu_has_mips32r6) { ++ emit(ctx, mul, tmp, lo(dst), hi(src)); ++ } else { ++ emit(ctx, multu, lo(dst), hi(src)); ++ emit(ctx, mflo, tmp); ++ } ++ ++ /* acc += tmp */ ++ emit(ctx, addu, acc, acc, tmp); ++ ++ /* tmp = lo(dst) * lo(src) >> 32 */ ++ /* lo(dst) = lo(dst) * lo(src) */ ++ if (cpu_has_mips32r6) { ++ emit(ctx, muhu, tmp, lo(dst), lo(src)); ++ emit(ctx, mulu, lo(dst), lo(dst), lo(src)); ++ } else { ++ emit(ctx, multu, lo(dst), lo(src)); ++ emit(ctx, mflo, lo(dst)); ++ emit(ctx, mfhi, tmp); ++ } ++ ++ /* hi(dst) = acc + tmp */ ++ emit(ctx, addu, hi(dst), acc, tmp); ++ clobber_reg64(ctx, dst); ++} ++ ++/* Helper function for 64-bit modulo */ ++static u64 jit_mod64(u64 a, u64 b) ++{ ++ u64 rem; ++ ++ div64_u64_rem(a, b, &rem); ++ return rem; ++} ++ ++/* ALU div/mod register (64-bit) */ ++static void emit_divmod_r64(struct jit_context *ctx, ++ const u8 dst[], const u8 src[], u8 op) ++{ ++ const u8 *r0 = bpf2mips32[BPF_REG_0]; /* Mapped to v0-v1 */ ++ const u8 *r1 = bpf2mips32[BPF_REG_1]; /* Mapped to a0-a1 */ ++ const u8 *r2 = bpf2mips32[BPF_REG_2]; /* Mapped to a2-a3 */ ++ int exclude, k; ++ u32 addr = 0; ++ ++ /* Push caller-saved registers on stack */ ++ push_regs(ctx, ctx->clobbered & JIT_CALLER_REGS, ++ 0, JIT_RESERVED_STACK); ++ ++ /* Put 64-bit arguments 1 and 2 in registers a0-a3 */ ++ for (k = 0; k < 2; k++) { ++ emit(ctx, move, MIPS_R_T9, src[k]); ++ emit(ctx, move, r1[k], dst[k]); ++ emit(ctx, move, r2[k], MIPS_R_T9); ++ } ++ ++ /* Emit function call */ ++ switch (BPF_OP(op)) { ++ /* dst = dst / src */ ++ case BPF_DIV: ++ addr = (u32)&div64_u64; ++ break; ++ /* dst = dst % src */ ++ case BPF_MOD: ++ addr = (u32)&jit_mod64; ++ break; ++ } ++ emit_mov_i(ctx, MIPS_R_T9, addr); ++ emit(ctx, jalr, MIPS_R_RA, MIPS_R_T9); ++ emit(ctx, nop); /* Delay slot */ ++ ++ /* Store the 64-bit result in dst */ ++ emit(ctx, move, dst[0], r0[0]); ++ emit(ctx, move, dst[1], r0[1]); ++ ++ /* Restore caller-saved registers, excluding the computed result */ ++ exclude = BIT(lo(dst)) | BIT(hi(dst)); ++ pop_regs(ctx, ctx->clobbered & JIT_CALLER_REGS, ++ exclude, JIT_RESERVED_STACK); ++ emit_load_delay(ctx); ++ ++ clobber_reg64(ctx, dst); ++ clobber_reg(ctx, MIPS_R_V0); ++ clobber_reg(ctx, MIPS_R_V1); ++ clobber_reg(ctx, MIPS_R_RA); ++} ++ ++/* Swap bytes in a register word */ ++static void emit_swap8_r(struct jit_context *ctx, u8 dst, u8 src, u8 mask) ++{ ++ u8 tmp = MIPS_R_T9; ++ ++ emit(ctx, and, tmp, src, mask); /* tmp = src & 0x00ff00ff */ ++ emit(ctx, sll, tmp, tmp, 8); /* tmp = tmp << 8 */ ++ emit(ctx, srl, dst, src, 8); /* dst = src >> 8 */ ++ emit(ctx, and, dst, dst, mask); /* dst = dst & 0x00ff00ff */ ++ emit(ctx, or, dst, 
dst, tmp); /* dst = dst | tmp */ ++} ++ ++/* Swap half words in a register word */ ++static void emit_swap16_r(struct jit_context *ctx, u8 dst, u8 src) ++{ ++ u8 tmp = MIPS_R_T9; ++ ++ emit(ctx, sll, tmp, src, 16); /* tmp = src << 16 */ ++ emit(ctx, srl, dst, src, 16); /* dst = src >> 16 */ ++ emit(ctx, or, dst, dst, tmp); /* dst = dst | tmp */ ++} ++ ++/* Swap bytes and truncate a register double word, word or half word */ ++static void emit_bswap_r64(struct jit_context *ctx, const u8 dst[], u32 width) ++{ ++ u8 tmp = MIPS_R_T8; ++ ++ switch (width) { ++ /* Swap bytes in a double word */ ++ case 64: ++ if (cpu_has_mips32r2 || cpu_has_mips32r6) { ++ emit(ctx, rotr, tmp, hi(dst), 16); ++ emit(ctx, rotr, hi(dst), lo(dst), 16); ++ emit(ctx, wsbh, lo(dst), tmp); ++ emit(ctx, wsbh, hi(dst), hi(dst)); ++ } else { ++ emit_swap16_r(ctx, tmp, lo(dst)); ++ emit_swap16_r(ctx, lo(dst), hi(dst)); ++ emit(ctx, move, hi(dst), tmp); ++ ++ emit(ctx, lui, tmp, 0xff); /* tmp = 0x00ff0000 */ ++ emit(ctx, ori, tmp, tmp, 0xff); /* tmp = 0x00ff00ff */ ++ emit_swap8_r(ctx, lo(dst), lo(dst), tmp); ++ emit_swap8_r(ctx, hi(dst), hi(dst), tmp); ++ } ++ break; ++ /* Swap bytes in a word */ ++ /* Swap bytes in a half word */ ++ case 32: ++ case 16: ++ emit_bswap_r(ctx, lo(dst), width); ++ emit(ctx, move, hi(dst), MIPS_R_ZERO); ++ break; ++ } ++ clobber_reg64(ctx, dst); ++} ++ ++/* Truncate a register double word, word or half word */ ++static void emit_trunc_r64(struct jit_context *ctx, const u8 dst[], u32 width) ++{ ++ switch (width) { ++ case 64: ++ break; ++ /* Zero-extend a word */ ++ case 32: ++ emit(ctx, move, hi(dst), MIPS_R_ZERO); ++ clobber_reg(ctx, hi(dst)); ++ break; ++ /* Zero-extend a half word */ ++ case 16: ++ emit(ctx, move, hi(dst), MIPS_R_ZERO); ++ emit(ctx, andi, lo(dst), lo(dst), 0xffff); ++ clobber_reg64(ctx, dst); ++ break; ++ } ++} ++ ++/* Load operation: dst = *(size*)(src + off) */ ++static void emit_ldx(struct jit_context *ctx, ++ const u8 dst[], u8 src, s16 off, u8 size) ++{ ++ switch (size) { ++ /* Load a byte */ ++ case BPF_B: ++ emit(ctx, lbu, lo(dst), off, src); ++ emit(ctx, move, hi(dst), MIPS_R_ZERO); ++ break; ++ /* Load a half word */ ++ case BPF_H: ++ emit(ctx, lhu, lo(dst), off, src); ++ emit(ctx, move, hi(dst), MIPS_R_ZERO); ++ break; ++ /* Load a word */ ++ case BPF_W: ++ emit(ctx, lw, lo(dst), off, src); ++ emit(ctx, move, hi(dst), MIPS_R_ZERO); ++ break; ++ /* Load a double word */ ++ case BPF_DW: ++ if (dst[1] == src) { ++ emit(ctx, lw, dst[0], off + 4, src); ++ emit(ctx, lw, dst[1], off, src); ++ } else { ++ emit(ctx, lw, dst[1], off, src); ++ emit(ctx, lw, dst[0], off + 4, src); ++ } ++ emit_load_delay(ctx); ++ break; ++ } ++ clobber_reg64(ctx, dst); ++} ++ ++/* Store operation: *(size *)(dst + off) = src */ ++static void emit_stx(struct jit_context *ctx, ++ const u8 dst, const u8 src[], s16 off, u8 size) ++{ ++ switch (size) { ++ /* Store a byte */ ++ case BPF_B: ++ emit(ctx, sb, lo(src), off, dst); ++ break; ++ /* Store a half word */ ++ case BPF_H: ++ emit(ctx, sh, lo(src), off, dst); ++ break; ++ /* Store a word */ ++ case BPF_W: ++ emit(ctx, sw, lo(src), off, dst); ++ break; ++ /* Store a double word */ ++ case BPF_DW: ++ emit(ctx, sw, src[1], off, dst); ++ emit(ctx, sw, src[0], off + 4, dst); ++ break; ++ } ++} ++ ++/* Atomic read-modify-write (32-bit, non-ll/sc fallback) */ ++static void emit_atomic_r32(struct jit_context *ctx, ++ u8 dst, u8 src, s16 off, u8 code) ++{ ++ u32 exclude = 0; ++ u32 addr = 0; ++ ++ /* Push caller-saved registers on stack */ ++ 
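/*
 * Editor's sketch (not part of the patch): the emit_swap8_r()/emit_swap16_r()
 * helpers above build a byte swap out of shifts, masks and ORs for CPUs
 * without wsbh/rotr.  Composing them (swap the half words, then swap the
 * bytes inside each half word) yields a full 32-bit byte swap, as this
 * stand-alone C rendition shows:
 */
#include <stdint.h>

static uint32_t swap16_w(uint32_t x)		/* mirrors emit_swap16_r() */
{
	return (x << 16) | (x >> 16);
}

static uint32_t swap8_w(uint32_t x)		/* mirrors emit_swap8_r() with mask 0x00ff00ff */
{
	const uint32_t mask = 0x00ff00ffu;

	return ((x & mask) << 8) | ((x >> 8) & mask);
}

static uint32_t bswap32_sketch(uint32_t x)
{
	return swap8_w(swap16_w(x));		/* 0xaabbccdd -> 0xddccbbaa */
}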
push_regs(ctx, ctx->clobbered & JIT_CALLER_REGS, ++ 0, JIT_RESERVED_STACK); ++ /* ++ * Argument 1: dst+off if xchg, otherwise src, passed in register a0 ++ * Argument 2: src if xchg, othersize dst+off, passed in register a1 ++ */ ++ emit(ctx, move, MIPS_R_T9, dst); ++ emit(ctx, move, MIPS_R_A0, src); ++ emit(ctx, addiu, MIPS_R_A1, MIPS_R_T9, off); ++ ++ /* Emit function call */ ++ switch (code) { ++ case BPF_ADD: ++ addr = (u32)&atomic_add; ++ break; ++ case BPF_SUB: ++ addr = (u32)&atomic_sub; ++ break; ++ case BPF_OR: ++ addr = (u32)&atomic_or; ++ break; ++ case BPF_AND: ++ addr = (u32)&atomic_and; ++ break; ++ case BPF_XOR: ++ addr = (u32)&atomic_xor; ++ break; ++ } ++ emit_mov_i(ctx, MIPS_R_T9, addr); ++ emit(ctx, jalr, MIPS_R_RA, MIPS_R_T9); ++ emit(ctx, nop); /* Delay slot */ ++ ++ /* Restore caller-saved registers, except any fetched value */ ++ pop_regs(ctx, ctx->clobbered & JIT_CALLER_REGS, ++ exclude, JIT_RESERVED_STACK); ++ emit_load_delay(ctx); ++ clobber_reg(ctx, MIPS_R_RA); ++} ++ ++/* Atomic read-modify-write (64-bit) */ ++static void emit_atomic_r64(struct jit_context *ctx, ++ u8 dst, const u8 src[], s16 off, u8 code) ++{ ++ const u8 *r1 = bpf2mips32[BPF_REG_1]; /* Mapped to a0-a1 */ ++ u32 exclude = 0; ++ u32 addr = 0; ++ ++ /* Push caller-saved registers on stack */ ++ push_regs(ctx, ctx->clobbered & JIT_CALLER_REGS, ++ 0, JIT_RESERVED_STACK); ++ /* ++ * Argument 1: 64-bit src, passed in registers a0-a1 ++ * Argument 2: 32-bit dst+off, passed in register a2 ++ */ ++ emit(ctx, move, MIPS_R_T9, dst); ++ emit(ctx, move, r1[0], src[0]); ++ emit(ctx, move, r1[1], src[1]); ++ emit(ctx, addiu, MIPS_R_A2, MIPS_R_T9, off); ++ ++ /* Emit function call */ ++ switch (code) { ++ case BPF_ADD: ++ addr = (u32)&atomic64_add; ++ break; ++ case BPF_SUB: ++ addr = (u32)&atomic64_sub; ++ break; ++ case BPF_OR: ++ addr = (u32)&atomic64_or; ++ break; ++ case BPF_AND: ++ addr = (u32)&atomic64_and; ++ break; ++ case BPF_XOR: ++ addr = (u32)&atomic64_xor; ++ break; ++ } ++ emit_mov_i(ctx, MIPS_R_T9, addr); ++ emit(ctx, jalr, MIPS_R_RA, MIPS_R_T9); ++ emit(ctx, nop); /* Delay slot */ ++ ++ /* Restore caller-saved registers, except any fetched value */ ++ pop_regs(ctx, ctx->clobbered & JIT_CALLER_REGS, ++ exclude, JIT_RESERVED_STACK); ++ emit_load_delay(ctx); ++ clobber_reg(ctx, MIPS_R_RA); ++} ++ ++/* ++ * Conditional movz or an emulated equivalent. ++ * Note that the rs register may be modified. ++ */ ++static void emit_movz_r(struct jit_context *ctx, u8 rd, u8 rs, u8 rt) ++{ ++ if (cpu_has_mips_2) { ++ emit(ctx, movz, rd, rs, rt); /* rd = rt ? rd : rs */ ++ } else if (cpu_has_mips32r6) { ++ if (rs != MIPS_R_ZERO) ++ emit(ctx, seleqz, rs, rs, rt); /* rs = 0 if rt == 0 */ ++ emit(ctx, selnez, rd, rd, rt); /* rd = 0 if rt != 0 */ ++ if (rs != MIPS_R_ZERO) ++ emit(ctx, or, rd, rd, rs); /* rd = rd | rs */ ++ } else { ++ emit(ctx, bnez, rt, 8); /* PC += 8 if rd != 0 */ ++ emit(ctx, nop); /* +0: delay slot */ ++ emit(ctx, or, rd, rs, MIPS_R_ZERO); /* +4: rd = rs */ ++ } ++ clobber_reg(ctx, rd); ++ clobber_reg(ctx, rs); ++} ++ ++/* ++ * Conditional movn or an emulated equivalent. ++ * Note that the rs register may be modified. ++ */ ++static void emit_movn_r(struct jit_context *ctx, u8 rd, u8 rs, u8 rt) ++{ ++ if (cpu_has_mips_2) { ++ emit(ctx, movn, rd, rs, rt); /* rd = rt ? 
rs : rd */ ++ } else if (cpu_has_mips32r6) { ++ if (rs != MIPS_R_ZERO) ++ emit(ctx, selnez, rs, rs, rt); /* rs = 0 if rt == 0 */ ++ emit(ctx, seleqz, rd, rd, rt); /* rd = 0 if rt != 0 */ ++ if (rs != MIPS_R_ZERO) ++ emit(ctx, or, rd, rd, rs); /* rd = rd | rs */ ++ } else { ++ emit(ctx, beqz, rt, 8); /* PC += 8 if rd == 0 */ ++ emit(ctx, nop); /* +0: delay slot */ ++ emit(ctx, or, rd, rs, MIPS_R_ZERO); /* +4: rd = rs */ ++ } ++ clobber_reg(ctx, rd); ++ clobber_reg(ctx, rs); ++} ++ ++/* Emulation of 64-bit sltiu rd, rs, imm, where imm may be S32_MAX + 1 */ ++static void emit_sltiu_r64(struct jit_context *ctx, u8 rd, ++ const u8 rs[], s64 imm) ++{ ++ u8 tmp = MIPS_R_T9; ++ ++ if (imm < 0) { ++ emit_mov_i(ctx, rd, imm); /* rd = imm */ ++ emit(ctx, sltu, rd, lo(rs), rd); /* rd = rsl < rd */ ++ emit(ctx, sltiu, tmp, hi(rs), -1); /* tmp = rsh < ~0U */ ++ emit(ctx, or, rd, rd, tmp); /* rd = rd | tmp */ ++ } else { /* imm >= 0 */ ++ if (imm > 0x7fff) { ++ emit_mov_i(ctx, rd, (s32)imm); /* rd = imm */ ++ emit(ctx, sltu, rd, lo(rs), rd); /* rd = rsl < rd */ ++ } else { ++ emit(ctx, sltiu, rd, lo(rs), imm); /* rd = rsl < imm */ ++ } ++ emit_movn_r(ctx, rd, MIPS_R_ZERO, hi(rs)); /* rd = 0 if rsh */ ++ } ++} ++ ++/* Emulation of 64-bit sltu rd, rs, rt */ ++static void emit_sltu_r64(struct jit_context *ctx, u8 rd, ++ const u8 rs[], const u8 rt[]) ++{ ++ u8 tmp = MIPS_R_T9; ++ ++ emit(ctx, sltu, rd, lo(rs), lo(rt)); /* rd = rsl < rtl */ ++ emit(ctx, subu, tmp, hi(rs), hi(rt)); /* tmp = rsh - rth */ ++ emit_movn_r(ctx, rd, MIPS_R_ZERO, tmp); /* rd = 0 if tmp != 0 */ ++ emit(ctx, sltu, tmp, hi(rs), hi(rt)); /* tmp = rsh < rth */ ++ emit(ctx, or, rd, rd, tmp); /* rd = rd | tmp */ ++} ++ ++/* Emulation of 64-bit slti rd, rs, imm, where imm may be S32_MAX + 1 */ ++static void emit_slti_r64(struct jit_context *ctx, u8 rd, ++ const u8 rs[], s64 imm) ++{ ++ u8 t1 = MIPS_R_T8; ++ u8 t2 = MIPS_R_T9; ++ u8 cmp; ++ ++ /* ++ * if ((rs < 0) ^ (imm < 0)) t1 = imm >u rsl ++ * else t1 = rsl > 31 */ ++ if (imm < 0) ++ emit_movz_r(ctx, t1, t2, rd); /* t1 = rd ? t1 : t2 */ ++ else ++ emit_movn_r(ctx, t1, t2, rd); /* t1 = rd ? t2 : t1 */ ++ /* ++ * if ((imm < 0 && rsh != 0xffffffff) || ++ * (imm >= 0 && rsh != 0)) ++ * t1 = 0 ++ */ ++ if (imm < 0) { ++ emit(ctx, addiu, rd, hi(rs), 1); /* rd = rsh + 1 */ ++ cmp = rd; ++ } else { /* imm >= 0 */ ++ cmp = hi(rs); ++ } ++ emit_movn_r(ctx, t1, MIPS_R_ZERO, cmp); /* t1 = 0 if cmp != 0 */ ++ ++ /* ++ * if (imm < 0) rd = rsh < -1 ++ * else rd = rsh != 0 ++ * rd = rd | t1 ++ */ ++ emit(ctx, slti, rd, hi(rs), imm < 0 ? -1 : 0); /* rd = rsh < hi(imm) */ ++ emit(ctx, or, rd, rd, t1); /* rd = rd | t1 */ ++} ++ ++/* Emulation of 64-bit(slt rd, rs, rt) */ ++static void emit_slt_r64(struct jit_context *ctx, u8 rd, ++ const u8 rs[], const u8 rt[]) ++{ ++ u8 t1 = MIPS_R_T7; ++ u8 t2 = MIPS_R_T8; ++ u8 t3 = MIPS_R_T9; ++ ++ /* ++ * if ((rs < 0) ^ (rt < 0)) t1 = rtl > 31 */ ++ emit_movn_r(ctx, t1, t2, rd); /* t1 = rd ? 
t2 : t1 */ ++ emit_movn_r(ctx, t1, MIPS_R_ZERO, t3); /* t1 = 0 if t3 != 0 */ ++ ++ /* rd = (rsh < rth) | t1 */ ++ emit(ctx, slt, rd, hi(rs), hi(rt)); /* rd = rsh = -0x7fff && imm <= 0x8000) { ++ emit(ctx, addiu, tmp, lo(dst), -imm); ++ } else if ((u32)imm <= 0xffff) { ++ emit(ctx, xori, tmp, lo(dst), imm); ++ } else { /* Register fallback */ ++ emit_mov_i(ctx, tmp, imm); ++ emit(ctx, xor, tmp, lo(dst), tmp); ++ } ++ if (imm < 0) { /* Compare sign extension */ ++ emit(ctx, addu, MIPS_R_T9, hi(dst), 1); ++ emit(ctx, or, tmp, tmp, MIPS_R_T9); ++ } else { /* Compare zero extension */ ++ emit(ctx, or, tmp, tmp, hi(dst)); ++ } ++ if (op == BPF_JEQ) ++ emit(ctx, beqz, tmp, off); ++ else /* BPF_JNE */ ++ emit(ctx, bnez, tmp, off); ++ break; ++ /* PC += off if dst & imm */ ++ /* PC += off if (dst & imm) == 0 (not in BPF, used for long jumps) */ ++ case BPF_JSET: ++ case JIT_JNSET: ++ if ((u32)imm <= 0xffff) { ++ emit(ctx, andi, tmp, lo(dst), imm); ++ } else { /* Register fallback */ ++ emit_mov_i(ctx, tmp, imm); ++ emit(ctx, and, tmp, lo(dst), tmp); ++ } ++ if (imm < 0) /* Sign-extension pulls in high word */ ++ emit(ctx, or, tmp, tmp, hi(dst)); ++ if (op == BPF_JSET) ++ emit(ctx, bnez, tmp, off); ++ else /* JIT_JNSET */ ++ emit(ctx, beqz, tmp, off); ++ break; ++ /* PC += off if dst > imm */ ++ case BPF_JGT: ++ emit_sltiu_r64(ctx, tmp, dst, (s64)imm + 1); ++ emit(ctx, beqz, tmp, off); ++ break; ++ /* PC += off if dst >= imm */ ++ case BPF_JGE: ++ emit_sltiu_r64(ctx, tmp, dst, imm); ++ emit(ctx, beqz, tmp, off); ++ break; ++ /* PC += off if dst < imm */ ++ case BPF_JLT: ++ emit_sltiu_r64(ctx, tmp, dst, imm); ++ emit(ctx, bnez, tmp, off); ++ break; ++ /* PC += off if dst <= imm */ ++ case BPF_JLE: ++ emit_sltiu_r64(ctx, tmp, dst, (s64)imm + 1); ++ emit(ctx, bnez, tmp, off); ++ break; ++ /* PC += off if dst > imm (signed) */ ++ case BPF_JSGT: ++ emit_slti_r64(ctx, tmp, dst, (s64)imm + 1); ++ emit(ctx, beqz, tmp, off); ++ break; ++ /* PC += off if dst >= imm (signed) */ ++ case BPF_JSGE: ++ emit_slti_r64(ctx, tmp, dst, imm); ++ emit(ctx, beqz, tmp, off); ++ break; ++ /* PC += off if dst < imm (signed) */ ++ case BPF_JSLT: ++ emit_slti_r64(ctx, tmp, dst, imm); ++ emit(ctx, bnez, tmp, off); ++ break; ++ /* PC += off if dst <= imm (signed) */ ++ case BPF_JSLE: ++ emit_slti_r64(ctx, tmp, dst, (s64)imm + 1); ++ emit(ctx, bnez, tmp, off); ++ break; ++ } ++} ++ ++/* Jump register (64-bit) */ ++static void emit_jmp_r64(struct jit_context *ctx, ++ const u8 dst[], const u8 src[], s32 off, u8 op) ++{ ++ u8 t1 = MIPS_R_T6; ++ u8 t2 = MIPS_R_T7; ++ ++ switch (op) { ++ /* No-op, used internally for branch optimization */ ++ case JIT_JNOP: ++ break; ++ /* PC += off if dst == src */ ++ /* PC += off if dst != src */ ++ case BPF_JEQ: ++ case BPF_JNE: ++ emit(ctx, subu, t1, lo(dst), lo(src)); ++ emit(ctx, subu, t2, hi(dst), hi(src)); ++ emit(ctx, or, t1, t1, t2); ++ if (op == BPF_JEQ) ++ emit(ctx, beqz, t1, off); ++ else /* BPF_JNE */ ++ emit(ctx, bnez, t1, off); ++ break; ++ /* PC += off if dst & src */ ++ /* PC += off if (dst & imm) == 0 (not in BPF, used for long jumps) */ ++ case BPF_JSET: ++ case JIT_JNSET: ++ emit(ctx, and, t1, lo(dst), lo(src)); ++ emit(ctx, and, t2, hi(dst), hi(src)); ++ emit(ctx, or, t1, t1, t2); ++ if (op == BPF_JSET) ++ emit(ctx, bnez, t1, off); ++ else /* JIT_JNSET */ ++ emit(ctx, beqz, t1, off); ++ break; ++ /* PC += off if dst > src */ ++ case BPF_JGT: ++ emit_sltu_r64(ctx, t1, src, dst); ++ emit(ctx, bnez, t1, off); ++ break; ++ /* PC += off if dst >= src */ ++ case BPF_JGE: ++ 
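/*
 * Editor's sketch (not part of the patch): all unsigned 64-bit branches in
 * this switch are reduced to emit_sltu_r64(), defined above, by swapping
 * operands and/or inverting the branch (JGT a,b taken if sltu b,a != 0;
 * JGE a,b taken if sltu a,b == 0, which is the case that follows).  The
 * helper's logic on 32-bit halves, written as plain C:
 */
#include <stdint.h>

static uint32_t sltu64_halves(uint32_t ah, uint32_t al, uint32_t bh, uint32_t bl)
{
	uint32_t rd  = al < bl;		/* sltu rd, lo(rs), lo(rt) */
	uint32_t tmp = ah - bh;		/* subu tmp, hi(rs), hi(rt) */

	if (tmp != 0)			/* movn rd, zero, tmp */
		rd = 0;
	return rd | (ah < bh);		/* or in "hi(rs) < hi(rt)" */
}
/* sltu64_halves(ah, al, bh, bl) == 1 iff a < b as unsigned 64-bit values. */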
emit_sltu_r64(ctx, t1, dst, src); ++ emit(ctx, beqz, t1, off); ++ break; ++ /* PC += off if dst < src */ ++ case BPF_JLT: ++ emit_sltu_r64(ctx, t1, dst, src); ++ emit(ctx, bnez, t1, off); ++ break; ++ /* PC += off if dst <= src */ ++ case BPF_JLE: ++ emit_sltu_r64(ctx, t1, src, dst); ++ emit(ctx, beqz, t1, off); ++ break; ++ /* PC += off if dst > src (signed) */ ++ case BPF_JSGT: ++ emit_slt_r64(ctx, t1, src, dst); ++ emit(ctx, bnez, t1, off); ++ break; ++ /* PC += off if dst >= src (signed) */ ++ case BPF_JSGE: ++ emit_slt_r64(ctx, t1, dst, src); ++ emit(ctx, beqz, t1, off); ++ break; ++ /* PC += off if dst < src (signed) */ ++ case BPF_JSLT: ++ emit_slt_r64(ctx, t1, dst, src); ++ emit(ctx, bnez, t1, off); ++ break; ++ /* PC += off if dst <= src (signed) */ ++ case BPF_JSLE: ++ emit_slt_r64(ctx, t1, src, dst); ++ emit(ctx, beqz, t1, off); ++ break; ++ } ++} ++ ++/* Function call */ ++static int emit_call(struct jit_context *ctx, const struct bpf_insn *insn) ++{ ++ bool fixed; ++ u64 addr; ++ ++ /* Decode the call address */ ++ if (bpf_jit_get_func_addr(ctx->program, insn, false, ++ &addr, &fixed) < 0) ++ return -1; ++ if (!fixed) ++ return -1; ++ ++ /* Push stack arguments */ ++ push_regs(ctx, JIT_STACK_REGS, 0, JIT_RESERVED_STACK); ++ ++ /* Emit function call */ ++ emit_mov_i(ctx, MIPS_R_T9, addr); ++ emit(ctx, jalr, MIPS_R_RA, MIPS_R_T9); ++ emit(ctx, nop); /* Delay slot */ ++ ++ clobber_reg(ctx, MIPS_R_RA); ++ clobber_reg(ctx, MIPS_R_V0); ++ clobber_reg(ctx, MIPS_R_V1); ++ return 0; ++} ++ ++/* Function tail call */ ++static int emit_tail_call(struct jit_context *ctx) ++{ ++ u8 ary = lo(bpf2mips32[BPF_REG_2]); ++ u8 ind = lo(bpf2mips32[BPF_REG_3]); ++ u8 t1 = MIPS_R_T8; ++ u8 t2 = MIPS_R_T9; ++ int off; ++ ++ /* ++ * Tail call: ++ * eBPF R1 - function argument (context ptr), passed in a0-a1 ++ * eBPF R2 - ptr to object with array of function entry points ++ * eBPF R3 - array index of function to be called ++ * stack[sz] - remaining tail call count, initialized in prologue ++ */ ++ ++ /* if (ind >= ary->map.max_entries) goto out */ ++ off = offsetof(struct bpf_array, map.max_entries); ++ if (off > 0x7fff) ++ return -1; ++ emit(ctx, lw, t1, off, ary); /* t1 = ary->map.max_entries*/ ++ emit_load_delay(ctx); /* Load delay slot */ ++ emit(ctx, sltu, t1, ind, t1); /* t1 = ind < t1 */ ++ emit(ctx, beqz, t1, get_offset(ctx, 1)); /* PC += off(1) if t1 == 0 */ ++ /* (next insn delay slot) */ ++ /* if (TCC-- <= 0) goto out */ ++ emit(ctx, lw, t2, ctx->stack_size, MIPS_R_SP); /* t2 = *(SP + size) */ ++ emit_load_delay(ctx); /* Load delay slot */ ++ emit(ctx, blez, t2, get_offset(ctx, 1)); /* PC += off(1) if t2 < 0 */ ++ emit(ctx, addiu, t2, t2, -1); /* t2-- (delay slot) */ ++ emit(ctx, sw, t2, ctx->stack_size, MIPS_R_SP); /* *(SP + size) = t2 */ ++ ++ /* prog = ary->ptrs[ind] */ ++ off = offsetof(struct bpf_array, ptrs); ++ if (off > 0x7fff) ++ return -1; ++ emit(ctx, sll, t1, ind, 2); /* t1 = ind << 2 */ ++ emit(ctx, addu, t1, t1, ary); /* t1 += ary */ ++ emit(ctx, lw, t2, off, t1); /* t2 = *(t1 + off) */ ++ emit_load_delay(ctx); /* Load delay slot */ ++ ++ /* if (prog == 0) goto out */ ++ emit(ctx, beqz, t2, get_offset(ctx, 1)); /* PC += off(1) if t2 == 0 */ ++ emit(ctx, nop); /* Delay slot */ ++ ++ /* func = prog->bpf_func + 8 (prologue skip offset) */ ++ off = offsetof(struct bpf_prog, bpf_func); ++ if (off > 0x7fff) ++ return -1; ++ emit(ctx, lw, t1, off, t2); /* t1 = *(t2 + off) */ ++ emit_load_delay(ctx); /* Load delay slot */ ++ emit(ctx, addiu, t1, t1, JIT_TCALL_SKIP); /* t1 += skip (8 or 
12) */ ++ ++ /* goto func */ ++ build_epilogue(ctx, t1); ++ return 0; ++} ++ ++/* ++ * Stack frame layout for a JITed program (stack grows down). ++ * ++ * Higher address : Caller's stack frame : ++ * :----------------------------: ++ * : 64-bit eBPF args r3-r5 : ++ * :----------------------------: ++ * : Reserved / tail call count : ++ * +============================+ <--- MIPS sp before call ++ * | Callee-saved registers, | ++ * | including RA and FP | ++ * +----------------------------+ <--- eBPF FP (MIPS zero,fp) ++ * | Local eBPF variables | ++ * | allocated by program | ++ * +----------------------------+ ++ * | Reserved for caller-saved | ++ * | registers | ++ * +----------------------------+ ++ * | Reserved for 64-bit eBPF | ++ * | args r3-r5 & args passed | ++ * | on stack in kernel calls | ++ * Lower address +============================+ <--- MIPS sp ++ */ ++ ++/* Build program prologue to set up the stack and registers */ ++void build_prologue(struct jit_context *ctx) ++{ ++ const u8 *r1 = bpf2mips32[BPF_REG_1]; ++ const u8 *fp = bpf2mips32[BPF_REG_FP]; ++ int stack, saved, locals, reserved; ++ ++ /* ++ * The first two instructions initialize TCC in the reserved (for us) ++ * 16-byte area in the parent's stack frame. On a tail call, the ++ * calling function jumps into the prologue after these instructions. ++ */ ++ emit(ctx, ori, MIPS_R_T9, MIPS_R_ZERO, ++ min(MAX_TAIL_CALL_CNT + 1, 0xffff)); ++ emit(ctx, sw, MIPS_R_T9, 0, MIPS_R_SP); ++ ++ /* ++ * Register eBPF R1 contains the 32-bit context pointer argument. ++ * A 32-bit argument is always passed in MIPS register a0, regardless ++ * of CPU endianness. Initialize R1 accordingly and zero-extend. ++ */ ++#ifdef __BIG_ENDIAN ++ emit(ctx, move, lo(r1), MIPS_R_A0); ++#endif ++ ++ /* === Entry-point for tail calls === */ ++ ++ /* Zero-extend the 32-bit argument */ ++ emit(ctx, move, hi(r1), MIPS_R_ZERO); ++ ++ /* If the eBPF frame pointer was accessed it must be saved */ ++ if (ctx->accessed & BIT(BPF_REG_FP)) ++ clobber_reg64(ctx, fp); ++ ++ /* Compute the stack space needed for callee-saved registers */ ++ saved = hweight32(ctx->clobbered & JIT_CALLEE_REGS) * sizeof(u32); ++ saved = ALIGN(saved, MIPS_STACK_ALIGNMENT); ++ ++ /* Stack space used by eBPF program local data */ ++ locals = ALIGN(ctx->program->aux->stack_depth, MIPS_STACK_ALIGNMENT); ++ ++ /* ++ * If we are emitting function calls, reserve extra stack space for ++ * caller-saved registers and function arguments passed on the stack. ++ * The required space is computed automatically during resource ++ * usage discovery (pass 1). ++ */ ++ reserved = ctx->stack_used; ++ ++ /* Allocate the stack frame */ ++ stack = ALIGN(saved + locals + reserved, MIPS_STACK_ALIGNMENT); ++ emit(ctx, addiu, MIPS_R_SP, MIPS_R_SP, -stack); ++ ++ /* Store callee-saved registers on stack */ ++ push_regs(ctx, ctx->clobbered & JIT_CALLEE_REGS, 0, stack - saved); ++ ++ /* Initialize the eBPF frame pointer if accessed */ ++ if (ctx->accessed & BIT(BPF_REG_FP)) ++ emit(ctx, addiu, lo(fp), MIPS_R_SP, stack - saved); ++ ++ ctx->saved_size = saved; ++ ctx->stack_size = stack; ++} ++ ++/* Build the program epilogue to restore the stack and registers */ ++void build_epilogue(struct jit_context *ctx, int dest_reg) ++{ ++ /* Restore callee-saved registers from stack */ ++ pop_regs(ctx, ctx->clobbered & JIT_CALLEE_REGS, 0, ++ ctx->stack_size - ctx->saved_size); ++ /* ++ * A 32-bit return value is always passed in MIPS register v0, ++ * but on big-endian targets the low part of R0 is mapped to v1. 
++ */ ++#ifdef __BIG_ENDIAN ++ emit(ctx, move, MIPS_R_V0, MIPS_R_V1); ++#endif ++ ++ /* Jump to the return address and adjust the stack pointer */ ++ emit(ctx, jr, dest_reg); ++ emit(ctx, addiu, MIPS_R_SP, MIPS_R_SP, ctx->stack_size); ++} ++ ++/* Build one eBPF instruction */ ++int build_insn(const struct bpf_insn *insn, struct jit_context *ctx) ++{ ++ const u8 *dst = bpf2mips32[insn->dst_reg]; ++ const u8 *src = bpf2mips32[insn->src_reg]; ++ const u8 *tmp = bpf2mips32[JIT_REG_TMP]; ++ u8 code = insn->code; ++ s16 off = insn->off; ++ s32 imm = insn->imm; ++ s32 val, rel; ++ u8 alu, jmp; ++ ++ switch (code) { ++ /* ALU operations */ ++ /* dst = imm */ ++ case BPF_ALU | BPF_MOV | BPF_K: ++ emit_mov_i(ctx, lo(dst), imm); ++ emit_zext_ver(ctx, dst); ++ break; ++ /* dst = src */ ++ case BPF_ALU | BPF_MOV | BPF_X: ++ if (imm == 1) { ++ /* Special mov32 for zext */ ++ emit_mov_i(ctx, hi(dst), 0); ++ } else { ++ emit_mov_r(ctx, lo(dst), lo(src)); ++ emit_zext_ver(ctx, dst); ++ } ++ break; ++ /* dst = -dst */ ++ case BPF_ALU | BPF_NEG: ++ emit_alu_i(ctx, lo(dst), 0, BPF_NEG); ++ emit_zext_ver(ctx, dst); ++ break; ++ /* dst = dst & imm */ ++ /* dst = dst | imm */ ++ /* dst = dst ^ imm */ ++ /* dst = dst << imm */ ++ /* dst = dst >> imm */ ++ /* dst = dst >> imm (arithmetic) */ ++ /* dst = dst + imm */ ++ /* dst = dst - imm */ ++ /* dst = dst * imm */ ++ /* dst = dst / imm */ ++ /* dst = dst % imm */ ++ case BPF_ALU | BPF_OR | BPF_K: ++ case BPF_ALU | BPF_AND | BPF_K: ++ case BPF_ALU | BPF_XOR | BPF_K: ++ case BPF_ALU | BPF_LSH | BPF_K: ++ case BPF_ALU | BPF_RSH | BPF_K: ++ case BPF_ALU | BPF_ARSH | BPF_K: ++ case BPF_ALU | BPF_ADD | BPF_K: ++ case BPF_ALU | BPF_SUB | BPF_K: ++ case BPF_ALU | BPF_MUL | BPF_K: ++ case BPF_ALU | BPF_DIV | BPF_K: ++ case BPF_ALU | BPF_MOD | BPF_K: ++ if (!valid_alu_i(BPF_OP(code), imm)) { ++ emit_mov_i(ctx, MIPS_R_T6, imm); ++ emit_alu_r(ctx, lo(dst), MIPS_R_T6, BPF_OP(code)); ++ } else if (rewrite_alu_i(BPF_OP(code), imm, &alu, &val)) { ++ emit_alu_i(ctx, lo(dst), val, alu); ++ } ++ emit_zext_ver(ctx, dst); ++ break; ++ /* dst = dst & src */ ++ /* dst = dst | src */ ++ /* dst = dst ^ src */ ++ /* dst = dst << src */ ++ /* dst = dst >> src */ ++ /* dst = dst >> src (arithmetic) */ ++ /* dst = dst + src */ ++ /* dst = dst - src */ ++ /* dst = dst * src */ ++ /* dst = dst / src */ ++ /* dst = dst % src */ ++ case BPF_ALU | BPF_AND | BPF_X: ++ case BPF_ALU | BPF_OR | BPF_X: ++ case BPF_ALU | BPF_XOR | BPF_X: ++ case BPF_ALU | BPF_LSH | BPF_X: ++ case BPF_ALU | BPF_RSH | BPF_X: ++ case BPF_ALU | BPF_ARSH | BPF_X: ++ case BPF_ALU | BPF_ADD | BPF_X: ++ case BPF_ALU | BPF_SUB | BPF_X: ++ case BPF_ALU | BPF_MUL | BPF_X: ++ case BPF_ALU | BPF_DIV | BPF_X: ++ case BPF_ALU | BPF_MOD | BPF_X: ++ emit_alu_r(ctx, lo(dst), lo(src), BPF_OP(code)); ++ emit_zext_ver(ctx, dst); ++ break; ++ /* dst = imm (64-bit) */ ++ case BPF_ALU64 | BPF_MOV | BPF_K: ++ emit_mov_se_i64(ctx, dst, imm); ++ break; ++ /* dst = src (64-bit) */ ++ case BPF_ALU64 | BPF_MOV | BPF_X: ++ emit_mov_r(ctx, lo(dst), lo(src)); ++ emit_mov_r(ctx, hi(dst), hi(src)); ++ break; ++ /* dst = -dst (64-bit) */ ++ case BPF_ALU64 | BPF_NEG: ++ emit_neg_i64(ctx, dst); ++ break; ++ /* dst = dst & imm (64-bit) */ ++ case BPF_ALU64 | BPF_AND | BPF_K: ++ emit_alu_i64(ctx, dst, imm, BPF_OP(code)); ++ break; ++ /* dst = dst | imm (64-bit) */ ++ /* dst = dst ^ imm (64-bit) */ ++ /* dst = dst + imm (64-bit) */ ++ /* dst = dst - imm (64-bit) */ ++ case BPF_ALU64 | BPF_OR | BPF_K: ++ case BPF_ALU64 | BPF_XOR | BPF_K: ++ case BPF_ALU64 | 
BPF_ADD | BPF_K: ++ case BPF_ALU64 | BPF_SUB | BPF_K: ++ if (imm) ++ emit_alu_i64(ctx, dst, imm, BPF_OP(code)); ++ break; ++ /* dst = dst << imm (64-bit) */ ++ /* dst = dst >> imm (64-bit) */ ++ /* dst = dst >> imm (64-bit, arithmetic) */ ++ case BPF_ALU64 | BPF_LSH | BPF_K: ++ case BPF_ALU64 | BPF_RSH | BPF_K: ++ case BPF_ALU64 | BPF_ARSH | BPF_K: ++ if (imm) ++ emit_shift_i64(ctx, dst, imm, BPF_OP(code)); ++ break; ++ /* dst = dst * imm (64-bit) */ ++ case BPF_ALU64 | BPF_MUL | BPF_K: ++ emit_mul_i64(ctx, dst, imm); ++ break; ++ /* dst = dst / imm (64-bit) */ ++ /* dst = dst % imm (64-bit) */ ++ case BPF_ALU64 | BPF_DIV | BPF_K: ++ case BPF_ALU64 | BPF_MOD | BPF_K: ++ /* ++ * Sign-extend the immediate value into a temporary register, ++ * and then do the operation on this register. ++ */ ++ emit_mov_se_i64(ctx, tmp, imm); ++ emit_divmod_r64(ctx, dst, tmp, BPF_OP(code)); ++ break; ++ /* dst = dst & src (64-bit) */ ++ /* dst = dst | src (64-bit) */ ++ /* dst = dst ^ src (64-bit) */ ++ /* dst = dst + src (64-bit) */ ++ /* dst = dst - src (64-bit) */ ++ case BPF_ALU64 | BPF_AND | BPF_X: ++ case BPF_ALU64 | BPF_OR | BPF_X: ++ case BPF_ALU64 | BPF_XOR | BPF_X: ++ case BPF_ALU64 | BPF_ADD | BPF_X: ++ case BPF_ALU64 | BPF_SUB | BPF_X: ++ emit_alu_r64(ctx, dst, src, BPF_OP(code)); ++ break; ++ /* dst = dst << src (64-bit) */ ++ /* dst = dst >> src (64-bit) */ ++ /* dst = dst >> src (64-bit, arithmetic) */ ++ case BPF_ALU64 | BPF_LSH | BPF_X: ++ case BPF_ALU64 | BPF_RSH | BPF_X: ++ case BPF_ALU64 | BPF_ARSH | BPF_X: ++ emit_shift_r64(ctx, dst, lo(src), BPF_OP(code)); ++ break; ++ /* dst = dst * src (64-bit) */ ++ case BPF_ALU64 | BPF_MUL | BPF_X: ++ emit_mul_r64(ctx, dst, src); ++ break; ++ /* dst = dst / src (64-bit) */ ++ /* dst = dst % src (64-bit) */ ++ case BPF_ALU64 | BPF_DIV | BPF_X: ++ case BPF_ALU64 | BPF_MOD | BPF_X: ++ emit_divmod_r64(ctx, dst, src, BPF_OP(code)); ++ break; ++ /* dst = htole(dst) */ ++ /* dst = htobe(dst) */ ++ case BPF_ALU | BPF_END | BPF_FROM_LE: ++ case BPF_ALU | BPF_END | BPF_FROM_BE: ++ if (BPF_SRC(code) == ++#ifdef __BIG_ENDIAN ++ BPF_FROM_LE ++#else ++ BPF_FROM_BE ++#endif ++ ) ++ emit_bswap_r64(ctx, dst, imm); ++ else ++ emit_trunc_r64(ctx, dst, imm); ++ break; ++ /* dst = imm64 */ ++ case BPF_LD | BPF_IMM | BPF_DW: ++ emit_mov_i(ctx, lo(dst), imm); ++ emit_mov_i(ctx, hi(dst), insn[1].imm); ++ return 1; ++ /* LDX: dst = *(size *)(src + off) */ ++ case BPF_LDX | BPF_MEM | BPF_W: ++ case BPF_LDX | BPF_MEM | BPF_H: ++ case BPF_LDX | BPF_MEM | BPF_B: ++ case BPF_LDX | BPF_MEM | BPF_DW: ++ emit_ldx(ctx, dst, lo(src), off, BPF_SIZE(code)); ++ break; ++ /* ST: *(size *)(dst + off) = imm */ ++ case BPF_ST | BPF_MEM | BPF_W: ++ case BPF_ST | BPF_MEM | BPF_H: ++ case BPF_ST | BPF_MEM | BPF_B: ++ case BPF_ST | BPF_MEM | BPF_DW: ++ switch (BPF_SIZE(code)) { ++ case BPF_DW: ++ /* Sign-extend immediate value into temporary reg */ ++ emit_mov_se_i64(ctx, tmp, imm); ++ break; ++ case BPF_W: ++ case BPF_H: ++ case BPF_B: ++ emit_mov_i(ctx, lo(tmp), imm); ++ break; ++ } ++ emit_stx(ctx, lo(dst), tmp, off, BPF_SIZE(code)); ++ break; ++ /* STX: *(size *)(dst + off) = src */ ++ case BPF_STX | BPF_MEM | BPF_W: ++ case BPF_STX | BPF_MEM | BPF_H: ++ case BPF_STX | BPF_MEM | BPF_B: ++ case BPF_STX | BPF_MEM | BPF_DW: ++ emit_stx(ctx, lo(dst), src, off, BPF_SIZE(code)); ++ break; ++ /* Speculation barrier */ ++ case BPF_ST | BPF_NOSPEC: ++ break; ++ /* Atomics */ ++ case BPF_STX | BPF_XADD | BPF_W: ++ switch (imm) { ++ case BPF_ADD: ++ case BPF_AND: ++ case BPF_OR: ++ case BPF_XOR: ++ if 
(cpu_has_llsc) ++ emit_atomic_r(ctx, lo(dst), lo(src), off, imm); ++ else /* Non-ll/sc fallback */ ++ emit_atomic_r32(ctx, lo(dst), lo(src), ++ off, imm); ++ break; ++ default: ++ goto notyet; ++ } ++ break; ++ /* Atomics (64-bit) */ ++ case BPF_STX | BPF_XADD | BPF_DW: ++ switch (imm) { ++ case BPF_ADD: ++ case BPF_AND: ++ case BPF_OR: ++ case BPF_XOR: ++ emit_atomic_r64(ctx, lo(dst), src, off, imm); ++ break; ++ default: ++ goto notyet; ++ } ++ break; ++ /* PC += off if dst == src */ ++ /* PC += off if dst != src */ ++ /* PC += off if dst & src */ ++ /* PC += off if dst > src */ ++ /* PC += off if dst >= src */ ++ /* PC += off if dst < src */ ++ /* PC += off if dst <= src */ ++ /* PC += off if dst > src (signed) */ ++ /* PC += off if dst >= src (signed) */ ++ /* PC += off if dst < src (signed) */ ++ /* PC += off if dst <= src (signed) */ ++ case BPF_JMP32 | BPF_JEQ | BPF_X: ++ case BPF_JMP32 | BPF_JNE | BPF_X: ++ case BPF_JMP32 | BPF_JSET | BPF_X: ++ case BPF_JMP32 | BPF_JGT | BPF_X: ++ case BPF_JMP32 | BPF_JGE | BPF_X: ++ case BPF_JMP32 | BPF_JLT | BPF_X: ++ case BPF_JMP32 | BPF_JLE | BPF_X: ++ case BPF_JMP32 | BPF_JSGT | BPF_X: ++ case BPF_JMP32 | BPF_JSGE | BPF_X: ++ case BPF_JMP32 | BPF_JSLT | BPF_X: ++ case BPF_JMP32 | BPF_JSLE | BPF_X: ++ if (off == 0) ++ break; ++ setup_jmp_r(ctx, dst == src, BPF_OP(code), off, &jmp, &rel); ++ emit_jmp_r(ctx, lo(dst), lo(src), rel, jmp); ++ if (finish_jmp(ctx, jmp, off) < 0) ++ goto toofar; ++ break; ++ /* PC += off if dst == imm */ ++ /* PC += off if dst != imm */ ++ /* PC += off if dst & imm */ ++ /* PC += off if dst > imm */ ++ /* PC += off if dst >= imm */ ++ /* PC += off if dst < imm */ ++ /* PC += off if dst <= imm */ ++ /* PC += off if dst > imm (signed) */ ++ /* PC += off if dst >= imm (signed) */ ++ /* PC += off if dst < imm (signed) */ ++ /* PC += off if dst <= imm (signed) */ ++ case BPF_JMP32 | BPF_JEQ | BPF_K: ++ case BPF_JMP32 | BPF_JNE | BPF_K: ++ case BPF_JMP32 | BPF_JSET | BPF_K: ++ case BPF_JMP32 | BPF_JGT | BPF_K: ++ case BPF_JMP32 | BPF_JGE | BPF_K: ++ case BPF_JMP32 | BPF_JLT | BPF_K: ++ case BPF_JMP32 | BPF_JLE | BPF_K: ++ case BPF_JMP32 | BPF_JSGT | BPF_K: ++ case BPF_JMP32 | BPF_JSGE | BPF_K: ++ case BPF_JMP32 | BPF_JSLT | BPF_K: ++ case BPF_JMP32 | BPF_JSLE | BPF_K: ++ if (off == 0) ++ break; ++ setup_jmp_i(ctx, imm, 32, BPF_OP(code), off, &jmp, &rel); ++ if (valid_jmp_i(jmp, imm)) { ++ emit_jmp_i(ctx, lo(dst), imm, rel, jmp); ++ } else { ++ /* Move large immediate to register */ ++ emit_mov_i(ctx, MIPS_R_T6, imm); ++ emit_jmp_r(ctx, lo(dst), MIPS_R_T6, rel, jmp); ++ } ++ if (finish_jmp(ctx, jmp, off) < 0) ++ goto toofar; ++ break; ++ /* PC += off if dst == src */ ++ /* PC += off if dst != src */ ++ /* PC += off if dst & src */ ++ /* PC += off if dst > src */ ++ /* PC += off if dst >= src */ ++ /* PC += off if dst < src */ ++ /* PC += off if dst <= src */ ++ /* PC += off if dst > src (signed) */ ++ /* PC += off if dst >= src (signed) */ ++ /* PC += off if dst < src (signed) */ ++ /* PC += off if dst <= src (signed) */ ++ case BPF_JMP | BPF_JEQ | BPF_X: ++ case BPF_JMP | BPF_JNE | BPF_X: ++ case BPF_JMP | BPF_JSET | BPF_X: ++ case BPF_JMP | BPF_JGT | BPF_X: ++ case BPF_JMP | BPF_JGE | BPF_X: ++ case BPF_JMP | BPF_JLT | BPF_X: ++ case BPF_JMP | BPF_JLE | BPF_X: ++ case BPF_JMP | BPF_JSGT | BPF_X: ++ case BPF_JMP | BPF_JSGE | BPF_X: ++ case BPF_JMP | BPF_JSLT | BPF_X: ++ case BPF_JMP | BPF_JSLE | BPF_X: ++ if (off == 0) ++ break; ++ setup_jmp_r(ctx, dst == src, BPF_OP(code), off, &jmp, &rel); ++ emit_jmp_r64(ctx, dst, src, 
rel, jmp); ++ if (finish_jmp(ctx, jmp, off) < 0) ++ goto toofar; ++ break; ++ /* PC += off if dst == imm */ ++ /* PC += off if dst != imm */ ++ /* PC += off if dst & imm */ ++ /* PC += off if dst > imm */ ++ /* PC += off if dst >= imm */ ++ /* PC += off if dst < imm */ ++ /* PC += off if dst <= imm */ ++ /* PC += off if dst > imm (signed) */ ++ /* PC += off if dst >= imm (signed) */ ++ /* PC += off if dst < imm (signed) */ ++ /* PC += off if dst <= imm (signed) */ ++ case BPF_JMP | BPF_JEQ | BPF_K: ++ case BPF_JMP | BPF_JNE | BPF_K: ++ case BPF_JMP | BPF_JSET | BPF_K: ++ case BPF_JMP | BPF_JGT | BPF_K: ++ case BPF_JMP | BPF_JGE | BPF_K: ++ case BPF_JMP | BPF_JLT | BPF_K: ++ case BPF_JMP | BPF_JLE | BPF_K: ++ case BPF_JMP | BPF_JSGT | BPF_K: ++ case BPF_JMP | BPF_JSGE | BPF_K: ++ case BPF_JMP | BPF_JSLT | BPF_K: ++ case BPF_JMP | BPF_JSLE | BPF_K: ++ if (off == 0) ++ break; ++ setup_jmp_i(ctx, imm, 64, BPF_OP(code), off, &jmp, &rel); ++ emit_jmp_i64(ctx, dst, imm, rel, jmp); ++ if (finish_jmp(ctx, jmp, off) < 0) ++ goto toofar; ++ break; ++ /* PC += off */ ++ case BPF_JMP | BPF_JA: ++ if (off == 0) ++ break; ++ if (emit_ja(ctx, off) < 0) ++ goto toofar; ++ break; ++ /* Tail call */ ++ case BPF_JMP | BPF_TAIL_CALL: ++ if (emit_tail_call(ctx) < 0) ++ goto invalid; ++ break; ++ /* Function call */ ++ case BPF_JMP | BPF_CALL: ++ if (emit_call(ctx, insn) < 0) ++ goto invalid; ++ break; ++ /* Function return */ ++ case BPF_JMP | BPF_EXIT: ++ /* ++ * Optimization: when last instruction is EXIT ++ * simply continue to epilogue. ++ */ ++ if (ctx->bpf_index == ctx->program->len - 1) ++ break; ++ if (emit_exit(ctx) < 0) ++ goto toofar; ++ break; ++ ++ default: ++invalid: ++ pr_err_once("unknown opcode %02x\n", code); ++ return -EINVAL; ++notyet: ++ pr_info_once("*** NOT YET: opcode %02x ***\n", code); ++ return -EFAULT; ++toofar: ++ pr_info_once("*** TOO FAR: jump at %u opcode %02x ***\n", ++ ctx->bpf_index, code); ++ return -E2BIG; ++ } ++ return 0; ++} diff --git a/root/target/linux/generic/backport-5.15/050-v5.16-03-mips-bpf-Add-new-eBPF-JIT-for-64-bit-MIPS.patch b/root/target/linux/generic/backport-5.15/050-v5.16-03-mips-bpf-Add-new-eBPF-JIT-for-64-bit-MIPS.patch new file mode 100755 index 00000000..38b46c0b --- /dev/null +++ b/root/target/linux/generic/backport-5.15/050-v5.16-03-mips-bpf-Add-new-eBPF-JIT-for-64-bit-MIPS.patch @@ -0,0 +1,1005 @@ +From: Johan Almbladh +Date: Tue, 5 Oct 2021 18:54:05 +0200 +Subject: [PATCH] mips: bpf: Add new eBPF JIT for 64-bit MIPS + +This is an implementation on of an eBPF JIT for 64-bit MIPS III-V and +MIPS64r1-r6. It uses the same framework introduced by the 32-bit JIT. + +Signed-off-by: Johan Almbladh +--- + create mode 100644 arch/mips/net/bpf_jit_comp64.c + +--- /dev/null ++++ b/arch/mips/net/bpf_jit_comp64.c +@@ -0,0 +1,991 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++/* ++ * Just-In-Time compiler for eBPF bytecode on MIPS. ++ * Implementation of JIT functions for 64-bit CPUs. ++ * ++ * Copyright (c) 2021 Anyfi Networks AB. ++ * Author: Johan Almbladh ++ * ++ * Based on code and ideas from ++ * Copyright (c) 2017 Cavium, Inc. 
++ * Copyright (c) 2017 Shubham Bansal ++ * Copyright (c) 2011 Mircea Gherzan ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "bpf_jit_comp.h" ++ ++/* MIPS t0-t3 are not available in the n64 ABI */ ++#undef MIPS_R_T0 ++#undef MIPS_R_T1 ++#undef MIPS_R_T2 ++#undef MIPS_R_T3 ++ ++/* Stack is 16-byte aligned in n64 ABI */ ++#define MIPS_STACK_ALIGNMENT 16 ++ ++/* Extra 64-bit eBPF registers used by JIT */ ++#define JIT_REG_TC (MAX_BPF_JIT_REG + 0) ++#define JIT_REG_ZX (MAX_BPF_JIT_REG + 1) ++ ++/* Number of prologue bytes to skip when doing a tail call */ ++#define JIT_TCALL_SKIP 4 ++ ++/* Callee-saved CPU registers that the JIT must preserve */ ++#define JIT_CALLEE_REGS \ ++ (BIT(MIPS_R_S0) | \ ++ BIT(MIPS_R_S1) | \ ++ BIT(MIPS_R_S2) | \ ++ BIT(MIPS_R_S3) | \ ++ BIT(MIPS_R_S4) | \ ++ BIT(MIPS_R_S5) | \ ++ BIT(MIPS_R_S6) | \ ++ BIT(MIPS_R_S7) | \ ++ BIT(MIPS_R_GP) | \ ++ BIT(MIPS_R_FP) | \ ++ BIT(MIPS_R_RA)) ++ ++/* Caller-saved CPU registers available for JIT use */ ++#define JIT_CALLER_REGS \ ++ (BIT(MIPS_R_A5) | \ ++ BIT(MIPS_R_A6) | \ ++ BIT(MIPS_R_A7)) ++/* ++ * Mapping of 64-bit eBPF registers to 64-bit native MIPS registers. ++ * MIPS registers t4 - t7 may be used by the JIT as temporary registers. ++ * MIPS registers t8 - t9 are reserved for single-register common functions. ++ */ ++static const u8 bpf2mips64[] = { ++ /* Return value from in-kernel function, and exit value from eBPF */ ++ [BPF_REG_0] = MIPS_R_V0, ++ /* Arguments from eBPF program to in-kernel function */ ++ [BPF_REG_1] = MIPS_R_A0, ++ [BPF_REG_2] = MIPS_R_A1, ++ [BPF_REG_3] = MIPS_R_A2, ++ [BPF_REG_4] = MIPS_R_A3, ++ [BPF_REG_5] = MIPS_R_A4, ++ /* Callee-saved registers that in-kernel function will preserve */ ++ [BPF_REG_6] = MIPS_R_S0, ++ [BPF_REG_7] = MIPS_R_S1, ++ [BPF_REG_8] = MIPS_R_S2, ++ [BPF_REG_9] = MIPS_R_S3, ++ /* Read-only frame pointer to access the eBPF stack */ ++ [BPF_REG_FP] = MIPS_R_FP, ++ /* Temporary register for blinding constants */ ++ [BPF_REG_AX] = MIPS_R_AT, ++ /* Tail call count register, caller-saved */ ++ [JIT_REG_TC] = MIPS_R_A5, ++ /* Constant for register zero-extension */ ++ [JIT_REG_ZX] = MIPS_R_V1, ++}; ++ ++/* ++ * MIPS 32-bit operations on 64-bit registers generate a sign-extended ++ * result. However, the eBPF ISA mandates zero-extension, so we rely on the ++ * verifier to add that for us (emit_zext_ver). In addition, ALU arithmetic ++ * operations, right shift and byte swap require properly sign-extended ++ * operands or the result is unpredictable. We emit explicit sign-extensions ++ * in those cases. 
++ */ ++ ++/* Sign extension */ ++static void emit_sext(struct jit_context *ctx, u8 dst, u8 src) ++{ ++ emit(ctx, sll, dst, src, 0); ++ clobber_reg(ctx, dst); ++} ++ ++/* Zero extension */ ++static void emit_zext(struct jit_context *ctx, u8 dst) ++{ ++ if (cpu_has_mips64r2 || cpu_has_mips64r6) { ++ emit(ctx, dinsu, dst, MIPS_R_ZERO, 32, 32); ++ } else { ++ emit(ctx, and, dst, dst, bpf2mips64[JIT_REG_ZX]); ++ access_reg(ctx, JIT_REG_ZX); /* We need the ZX register */ ++ } ++ clobber_reg(ctx, dst); ++} ++ ++/* Zero extension, if verifier does not do it for us */ ++static void emit_zext_ver(struct jit_context *ctx, u8 dst) ++{ ++ if (!ctx->program->aux->verifier_zext) ++ emit_zext(ctx, dst); ++} ++ ++/* dst = imm (64-bit) */ ++static void emit_mov_i64(struct jit_context *ctx, u8 dst, u64 imm64) ++{ ++ if (imm64 >= 0xffffffffffff8000ULL || imm64 < 0x8000ULL) { ++ emit(ctx, daddiu, dst, MIPS_R_ZERO, (s16)imm64); ++ } else if (imm64 >= 0xffffffff80000000ULL || ++ (imm64 < 0x80000000 && imm64 > 0xffff)) { ++ emit(ctx, lui, dst, (s16)(imm64 >> 16)); ++ emit(ctx, ori, dst, dst, (u16)imm64 & 0xffff); ++ } else { ++ u8 acc = MIPS_R_ZERO; ++ int k; ++ ++ for (k = 0; k < 4; k++) { ++ u16 half = imm64 >> (48 - 16 * k); ++ ++ if (acc == dst) ++ emit(ctx, dsll, dst, dst, 16); ++ ++ if (half) { ++ emit(ctx, ori, dst, acc, half); ++ acc = dst; ++ } ++ } ++ } ++ clobber_reg(ctx, dst); ++} ++ ++/* ALU immediate operation (64-bit) */ ++static void emit_alu_i64(struct jit_context *ctx, u8 dst, s32 imm, u8 op) ++{ ++ switch (BPF_OP(op)) { ++ /* dst = dst | imm */ ++ case BPF_OR: ++ emit(ctx, ori, dst, dst, (u16)imm); ++ break; ++ /* dst = dst ^ imm */ ++ case BPF_XOR: ++ emit(ctx, xori, dst, dst, (u16)imm); ++ break; ++ /* dst = -dst */ ++ case BPF_NEG: ++ emit(ctx, dsubu, dst, MIPS_R_ZERO, dst); ++ break; ++ /* dst = dst << imm */ ++ case BPF_LSH: ++ emit(ctx, dsll_safe, dst, dst, imm); ++ break; ++ /* dst = dst >> imm */ ++ case BPF_RSH: ++ emit(ctx, dsrl_safe, dst, dst, imm); ++ break; ++ /* dst = dst >> imm (arithmetic) */ ++ case BPF_ARSH: ++ emit(ctx, dsra_safe, dst, dst, imm); ++ break; ++ /* dst = dst + imm */ ++ case BPF_ADD: ++ emit(ctx, daddiu, dst, dst, imm); ++ break; ++ /* dst = dst - imm */ ++ case BPF_SUB: ++ emit(ctx, daddiu, dst, dst, -imm); ++ break; ++ default: ++ /* Width-generic operations */ ++ emit_alu_i(ctx, dst, imm, op); ++ } ++ clobber_reg(ctx, dst); ++} ++ ++/* ALU register operation (64-bit) */ ++static void emit_alu_r64(struct jit_context *ctx, u8 dst, u8 src, u8 op) ++{ ++ switch (BPF_OP(op)) { ++ /* dst = dst << src */ ++ case BPF_LSH: ++ emit(ctx, dsllv, dst, dst, src); ++ break; ++ /* dst = dst >> src */ ++ case BPF_RSH: ++ emit(ctx, dsrlv, dst, dst, src); ++ break; ++ /* dst = dst >> src (arithmetic) */ ++ case BPF_ARSH: ++ emit(ctx, dsrav, dst, dst, src); ++ break; ++ /* dst = dst + src */ ++ case BPF_ADD: ++ emit(ctx, daddu, dst, dst, src); ++ break; ++ /* dst = dst - src */ ++ case BPF_SUB: ++ emit(ctx, dsubu, dst, dst, src); ++ break; ++ /* dst = dst * src */ ++ case BPF_MUL: ++ if (cpu_has_mips64r6) { ++ emit(ctx, dmulu, dst, dst, src); ++ } else { ++ emit(ctx, dmultu, dst, src); ++ emit(ctx, mflo, dst); ++ } ++ break; ++ /* dst = dst / src */ ++ case BPF_DIV: ++ if (cpu_has_mips64r6) { ++ emit(ctx, ddivu_r6, dst, dst, src); ++ } else { ++ emit(ctx, ddivu, dst, src); ++ emit(ctx, mflo, dst); ++ } ++ break; ++ /* dst = dst % src */ ++ case BPF_MOD: ++ if (cpu_has_mips64r6) { ++ emit(ctx, dmodu, dst, dst, src); ++ } else { ++ emit(ctx, ddivu, dst, src); ++ emit(ctx, mfhi, 
dst); ++ } ++ break; ++ default: ++ /* Width-generic operations */ ++ emit_alu_r(ctx, dst, src, op); ++ } ++ clobber_reg(ctx, dst); ++} ++ ++/* Swap sub words in a register double word */ ++static void emit_swap_r64(struct jit_context *ctx, u8 dst, u8 mask, u32 bits) ++{ ++ u8 tmp = MIPS_R_T9; ++ ++ emit(ctx, and, tmp, dst, mask); /* tmp = dst & mask */ ++ emit(ctx, dsll, tmp, tmp, bits); /* tmp = tmp << bits */ ++ emit(ctx, dsrl, dst, dst, bits); /* dst = dst >> bits */ ++ emit(ctx, and, dst, dst, mask); /* dst = dst & mask */ ++ emit(ctx, or, dst, dst, tmp); /* dst = dst | tmp */ ++} ++ ++/* Swap bytes and truncate a register double word, word or half word */ ++static void emit_bswap_r64(struct jit_context *ctx, u8 dst, u32 width) ++{ ++ switch (width) { ++ /* Swap bytes in a double word */ ++ case 64: ++ if (cpu_has_mips64r2 || cpu_has_mips64r6) { ++ emit(ctx, dsbh, dst, dst); ++ emit(ctx, dshd, dst, dst); ++ } else { ++ u8 t1 = MIPS_R_T6; ++ u8 t2 = MIPS_R_T7; ++ ++ emit(ctx, dsll32, t2, dst, 0); /* t2 = dst << 32 */ ++ emit(ctx, dsrl32, dst, dst, 0); /* dst = dst >> 32 */ ++ emit(ctx, or, dst, dst, t2); /* dst = dst | t2 */ ++ ++ emit(ctx, ori, t2, MIPS_R_ZERO, 0xffff); ++ emit(ctx, dsll32, t1, t2, 0); /* t1 = t2 << 32 */ ++ emit(ctx, or, t1, t1, t2); /* t1 = t1 | t2 */ ++ emit_swap_r64(ctx, dst, t1, 16);/* dst = swap16(dst) */ ++ ++ emit(ctx, lui, t2, 0xff); /* t2 = 0x00ff0000 */ ++ emit(ctx, ori, t2, t2, 0xff); /* t2 = t2 | 0x00ff */ ++ emit(ctx, dsll32, t1, t2, 0); /* t1 = t2 << 32 */ ++ emit(ctx, or, t1, t1, t2); /* t1 = t1 | t2 */ ++ emit_swap_r64(ctx, dst, t1, 8); /* dst = swap8(dst) */ ++ } ++ break; ++ /* Swap bytes in a half word */ ++ /* Swap bytes in a word */ ++ case 32: ++ case 16: ++ emit_sext(ctx, dst, dst); ++ emit_bswap_r(ctx, dst, width); ++ if (cpu_has_mips64r2 || cpu_has_mips64r6) ++ emit_zext(ctx, dst); ++ break; ++ } ++ clobber_reg(ctx, dst); ++} ++ ++/* Truncate a register double word, word or half word */ ++static void emit_trunc_r64(struct jit_context *ctx, u8 dst, u32 width) ++{ ++ switch (width) { ++ case 64: ++ break; ++ /* Zero-extend a word */ ++ case 32: ++ emit_zext(ctx, dst); ++ break; ++ /* Zero-extend a half word */ ++ case 16: ++ emit(ctx, andi, dst, dst, 0xffff); ++ break; ++ } ++ clobber_reg(ctx, dst); ++} ++ ++/* Load operation: dst = *(size*)(src + off) */ ++static void emit_ldx(struct jit_context *ctx, u8 dst, u8 src, s16 off, u8 size) ++{ ++ switch (size) { ++ /* Load a byte */ ++ case BPF_B: ++ emit(ctx, lbu, dst, off, src); ++ break; ++ /* Load a half word */ ++ case BPF_H: ++ emit(ctx, lhu, dst, off, src); ++ break; ++ /* Load a word */ ++ case BPF_W: ++ emit(ctx, lwu, dst, off, src); ++ break; ++ /* Load a double word */ ++ case BPF_DW: ++ emit(ctx, ld, dst, off, src); ++ break; ++ } ++ clobber_reg(ctx, dst); ++} ++ ++/* Store operation: *(size *)(dst + off) = src */ ++static void emit_stx(struct jit_context *ctx, u8 dst, u8 src, s16 off, u8 size) ++{ ++ switch (size) { ++ /* Store a byte */ ++ case BPF_B: ++ emit(ctx, sb, src, off, dst); ++ break; ++ /* Store a half word */ ++ case BPF_H: ++ emit(ctx, sh, src, off, dst); ++ break; ++ /* Store a word */ ++ case BPF_W: ++ emit(ctx, sw, src, off, dst); ++ break; ++ /* Store a double word */ ++ case BPF_DW: ++ emit(ctx, sd, src, off, dst); ++ break; ++ } ++} ++ ++/* Atomic read-modify-write */ ++static void emit_atomic_r64(struct jit_context *ctx, ++ u8 dst, u8 src, s16 off, u8 code) ++{ ++ u8 t1 = MIPS_R_T6; ++ u8 t2 = MIPS_R_T7; ++ ++ emit(ctx, lld, t1, off, dst); ++ switch (code) { ++ case 
BPF_ADD: ++ emit(ctx, daddu, t2, t1, src); ++ break; ++ case BPF_AND: ++ emit(ctx, and, t2, t1, src); ++ break; ++ case BPF_OR: ++ emit(ctx, or, t2, t1, src); ++ break; ++ case BPF_XOR: ++ emit(ctx, xor, t2, t1, src); ++ break; ++ } ++ emit(ctx, scd, t2, off, dst); ++ emit(ctx, beqz, t2, -16); ++ emit(ctx, nop); /* Delay slot */ ++} ++ ++/* Function call */ ++static int emit_call(struct jit_context *ctx, const struct bpf_insn *insn) ++{ ++ u8 zx = bpf2mips64[JIT_REG_ZX]; ++ u8 tmp = MIPS_R_T6; ++ bool fixed; ++ u64 addr; ++ ++ /* Decode the call address */ ++ if (bpf_jit_get_func_addr(ctx->program, insn, false, ++ &addr, &fixed) < 0) ++ return -1; ++ if (!fixed) ++ return -1; ++ ++ /* Push caller-saved registers on stack */ ++ push_regs(ctx, ctx->clobbered & JIT_CALLER_REGS, 0, 0); ++ ++ /* Emit function call */ ++ emit_mov_i64(ctx, tmp, addr); ++ emit(ctx, jalr, MIPS_R_RA, tmp); ++ emit(ctx, nop); /* Delay slot */ ++ ++ /* Restore caller-saved registers */ ++ pop_regs(ctx, ctx->clobbered & JIT_CALLER_REGS, 0, 0); ++ ++ /* Re-initialize the JIT zero-extension register if accessed */ ++ if (ctx->accessed & BIT(JIT_REG_ZX)) { ++ emit(ctx, daddiu, zx, MIPS_R_ZERO, -1); ++ emit(ctx, dsrl32, zx, zx, 0); ++ } ++ ++ clobber_reg(ctx, MIPS_R_RA); ++ clobber_reg(ctx, MIPS_R_V0); ++ clobber_reg(ctx, MIPS_R_V1); ++ return 0; ++} ++ ++/* Function tail call */ ++static int emit_tail_call(struct jit_context *ctx) ++{ ++ u8 ary = bpf2mips64[BPF_REG_2]; ++ u8 ind = bpf2mips64[BPF_REG_3]; ++ u8 tcc = bpf2mips64[JIT_REG_TC]; ++ u8 tmp = MIPS_R_T6; ++ int off; ++ ++ /* ++ * Tail call: ++ * eBPF R1 - function argument (context ptr), passed in a0-a1 ++ * eBPF R2 - ptr to object with array of function entry points ++ * eBPF R3 - array index of function to be called ++ */ ++ ++ /* if (ind >= ary->map.max_entries) goto out */ ++ off = offsetof(struct bpf_array, map.max_entries); ++ if (off > 0x7fff) ++ return -1; ++ emit(ctx, lwu, tmp, off, ary); /* tmp = ary->map.max_entrs*/ ++ emit(ctx, sltu, tmp, ind, tmp); /* tmp = ind < t1 */ ++ emit(ctx, beqz, tmp, get_offset(ctx, 1)); /* PC += off(1) if tmp == 0*/ ++ ++ /* if (--TCC < 0) goto out */ ++ emit(ctx, daddiu, tcc, tcc, -1); /* tcc-- (delay slot) */ ++ emit(ctx, bltz, tcc, get_offset(ctx, 1)); /* PC += off(1) if tcc < 0 */ ++ /* (next insn delay slot) */ ++ /* prog = ary->ptrs[ind] */ ++ off = offsetof(struct bpf_array, ptrs); ++ if (off > 0x7fff) ++ return -1; ++ emit(ctx, dsll, tmp, ind, 3); /* tmp = ind << 3 */ ++ emit(ctx, daddu, tmp, tmp, ary); /* tmp += ary */ ++ emit(ctx, ld, tmp, off, tmp); /* tmp = *(tmp + off) */ ++ ++ /* if (prog == 0) goto out */ ++ emit(ctx, beqz, tmp, get_offset(ctx, 1)); /* PC += off(1) if tmp == 0*/ ++ emit(ctx, nop); /* Delay slot */ ++ ++ /* func = prog->bpf_func + 8 (prologue skip offset) */ ++ off = offsetof(struct bpf_prog, bpf_func); ++ if (off > 0x7fff) ++ return -1; ++ emit(ctx, ld, tmp, off, tmp); /* tmp = *(tmp + off) */ ++ emit(ctx, daddiu, tmp, tmp, JIT_TCALL_SKIP); /* tmp += skip (4) */ ++ ++ /* goto func */ ++ build_epilogue(ctx, tmp); ++ access_reg(ctx, JIT_REG_TC); ++ return 0; ++} ++ ++/* ++ * Stack frame layout for a JITed program (stack grows down). 
++ * ++ * Higher address : Previous stack frame : ++ * +===========================+ <--- MIPS sp before call ++ * | Callee-saved registers, | ++ * | including RA and FP | ++ * +---------------------------+ <--- eBPF FP (MIPS fp) ++ * | Local eBPF variables | ++ * | allocated by program | ++ * +---------------------------+ ++ * | Reserved for caller-saved | ++ * | registers | ++ * Lower address +===========================+ <--- MIPS sp ++ */ ++ ++/* Build program prologue to set up the stack and registers */ ++void build_prologue(struct jit_context *ctx) ++{ ++ u8 fp = bpf2mips64[BPF_REG_FP]; ++ u8 tc = bpf2mips64[JIT_REG_TC]; ++ u8 zx = bpf2mips64[JIT_REG_ZX]; ++ int stack, saved, locals, reserved; ++ ++ /* ++ * The first instruction initializes the tail call count register. ++ * On a tail call, the calling function jumps into the prologue ++ * after this instruction. ++ */ ++ emit(ctx, addiu, tc, MIPS_R_ZERO, min(MAX_TAIL_CALL_CNT + 1, 0xffff)); ++ ++ /* === Entry-point for tail calls === */ ++ ++ /* ++ * If the eBPF frame pointer and tail call count registers were ++ * accessed they must be preserved. Mark them as clobbered here ++ * to save and restore them on the stack as needed. ++ */ ++ if (ctx->accessed & BIT(BPF_REG_FP)) ++ clobber_reg(ctx, fp); ++ if (ctx->accessed & BIT(JIT_REG_TC)) ++ clobber_reg(ctx, tc); ++ if (ctx->accessed & BIT(JIT_REG_ZX)) ++ clobber_reg(ctx, zx); ++ ++ /* Compute the stack space needed for callee-saved registers */ ++ saved = hweight32(ctx->clobbered & JIT_CALLEE_REGS) * sizeof(u64); ++ saved = ALIGN(saved, MIPS_STACK_ALIGNMENT); ++ ++ /* Stack space used by eBPF program local data */ ++ locals = ALIGN(ctx->program->aux->stack_depth, MIPS_STACK_ALIGNMENT); ++ ++ /* ++ * If we are emitting function calls, reserve extra stack space for ++ * caller-saved registers needed by the JIT. The required space is ++ * computed automatically during resource usage discovery (pass 1). 
++ */ ++ reserved = ctx->stack_used; ++ ++ /* Allocate the stack frame */ ++ stack = ALIGN(saved + locals + reserved, MIPS_STACK_ALIGNMENT); ++ if (stack) ++ emit(ctx, daddiu, MIPS_R_SP, MIPS_R_SP, -stack); ++ ++ /* Store callee-saved registers on stack */ ++ push_regs(ctx, ctx->clobbered & JIT_CALLEE_REGS, 0, stack - saved); ++ ++ /* Initialize the eBPF frame pointer if accessed */ ++ if (ctx->accessed & BIT(BPF_REG_FP)) ++ emit(ctx, daddiu, fp, MIPS_R_SP, stack - saved); ++ ++ /* Initialize the ePF JIT zero-extension register if accessed */ ++ if (ctx->accessed & BIT(JIT_REG_ZX)) { ++ emit(ctx, daddiu, zx, MIPS_R_ZERO, -1); ++ emit(ctx, dsrl32, zx, zx, 0); ++ } ++ ++ ctx->saved_size = saved; ++ ctx->stack_size = stack; ++} ++ ++/* Build the program epilogue to restore the stack and registers */ ++void build_epilogue(struct jit_context *ctx, int dest_reg) ++{ ++ /* Restore callee-saved registers from stack */ ++ pop_regs(ctx, ctx->clobbered & JIT_CALLEE_REGS, 0, ++ ctx->stack_size - ctx->saved_size); ++ ++ /* Release the stack frame */ ++ if (ctx->stack_size) ++ emit(ctx, daddiu, MIPS_R_SP, MIPS_R_SP, ctx->stack_size); ++ ++ /* Jump to return address and sign-extend the 32-bit return value */ ++ emit(ctx, jr, dest_reg); ++ emit(ctx, sll, MIPS_R_V0, MIPS_R_V0, 0); /* Delay slot */ ++} ++ ++/* Build one eBPF instruction */ ++int build_insn(const struct bpf_insn *insn, struct jit_context *ctx) ++{ ++ u8 dst = bpf2mips64[insn->dst_reg]; ++ u8 src = bpf2mips64[insn->src_reg]; ++ u8 code = insn->code; ++ s16 off = insn->off; ++ s32 imm = insn->imm; ++ s32 val, rel; ++ u8 alu, jmp; ++ ++ switch (code) { ++ /* ALU operations */ ++ /* dst = imm */ ++ case BPF_ALU | BPF_MOV | BPF_K: ++ emit_mov_i(ctx, dst, imm); ++ emit_zext_ver(ctx, dst); ++ break; ++ /* dst = src */ ++ case BPF_ALU | BPF_MOV | BPF_X: ++ if (imm == 1) { ++ /* Special mov32 for zext */ ++ emit_zext(ctx, dst); ++ } else { ++ emit_mov_r(ctx, dst, src); ++ emit_zext_ver(ctx, dst); ++ } ++ break; ++ /* dst = -dst */ ++ case BPF_ALU | BPF_NEG: ++ emit_sext(ctx, dst, dst); ++ emit_alu_i(ctx, dst, 0, BPF_NEG); ++ emit_zext_ver(ctx, dst); ++ break; ++ /* dst = dst & imm */ ++ /* dst = dst | imm */ ++ /* dst = dst ^ imm */ ++ /* dst = dst << imm */ ++ case BPF_ALU | BPF_OR | BPF_K: ++ case BPF_ALU | BPF_AND | BPF_K: ++ case BPF_ALU | BPF_XOR | BPF_K: ++ case BPF_ALU | BPF_LSH | BPF_K: ++ if (!valid_alu_i(BPF_OP(code), imm)) { ++ emit_mov_i(ctx, MIPS_R_T4, imm); ++ emit_alu_r(ctx, dst, MIPS_R_T4, BPF_OP(code)); ++ } else if (rewrite_alu_i(BPF_OP(code), imm, &alu, &val)) { ++ emit_alu_i(ctx, dst, val, alu); ++ } ++ emit_zext_ver(ctx, dst); ++ break; ++ /* dst = dst >> imm */ ++ /* dst = dst >> imm (arithmetic) */ ++ /* dst = dst + imm */ ++ /* dst = dst - imm */ ++ /* dst = dst * imm */ ++ /* dst = dst / imm */ ++ /* dst = dst % imm */ ++ case BPF_ALU | BPF_RSH | BPF_K: ++ case BPF_ALU | BPF_ARSH | BPF_K: ++ case BPF_ALU | BPF_ADD | BPF_K: ++ case BPF_ALU | BPF_SUB | BPF_K: ++ case BPF_ALU | BPF_MUL | BPF_K: ++ case BPF_ALU | BPF_DIV | BPF_K: ++ case BPF_ALU | BPF_MOD | BPF_K: ++ if (!valid_alu_i(BPF_OP(code), imm)) { ++ emit_sext(ctx, dst, dst); ++ emit_mov_i(ctx, MIPS_R_T4, imm); ++ emit_alu_r(ctx, dst, MIPS_R_T4, BPF_OP(code)); ++ } else if (rewrite_alu_i(BPF_OP(code), imm, &alu, &val)) { ++ emit_sext(ctx, dst, dst); ++ emit_alu_i(ctx, dst, val, alu); ++ } ++ emit_zext_ver(ctx, dst); ++ break; ++ /* dst = dst & src */ ++ /* dst = dst | src */ ++ /* dst = dst ^ src */ ++ /* dst = dst << src */ ++ case BPF_ALU | BPF_AND | BPF_X: ++ case 
BPF_ALU | BPF_OR | BPF_X: ++ case BPF_ALU | BPF_XOR | BPF_X: ++ case BPF_ALU | BPF_LSH | BPF_X: ++ emit_alu_r(ctx, dst, src, BPF_OP(code)); ++ emit_zext_ver(ctx, dst); ++ break; ++ /* dst = dst >> src */ ++ /* dst = dst >> src (arithmetic) */ ++ /* dst = dst + src */ ++ /* dst = dst - src */ ++ /* dst = dst * src */ ++ /* dst = dst / src */ ++ /* dst = dst % src */ ++ case BPF_ALU | BPF_RSH | BPF_X: ++ case BPF_ALU | BPF_ARSH | BPF_X: ++ case BPF_ALU | BPF_ADD | BPF_X: ++ case BPF_ALU | BPF_SUB | BPF_X: ++ case BPF_ALU | BPF_MUL | BPF_X: ++ case BPF_ALU | BPF_DIV | BPF_X: ++ case BPF_ALU | BPF_MOD | BPF_X: ++ emit_sext(ctx, dst, dst); ++ emit_sext(ctx, MIPS_R_T4, src); ++ emit_alu_r(ctx, dst, MIPS_R_T4, BPF_OP(code)); ++ emit_zext_ver(ctx, dst); ++ break; ++ /* dst = imm (64-bit) */ ++ case BPF_ALU64 | BPF_MOV | BPF_K: ++ emit_mov_i(ctx, dst, imm); ++ break; ++ /* dst = src (64-bit) */ ++ case BPF_ALU64 | BPF_MOV | BPF_X: ++ emit_mov_r(ctx, dst, src); ++ break; ++ /* dst = -dst (64-bit) */ ++ case BPF_ALU64 | BPF_NEG: ++ emit_alu_i64(ctx, dst, 0, BPF_NEG); ++ break; ++ /* dst = dst & imm (64-bit) */ ++ /* dst = dst | imm (64-bit) */ ++ /* dst = dst ^ imm (64-bit) */ ++ /* dst = dst << imm (64-bit) */ ++ /* dst = dst >> imm (64-bit) */ ++ /* dst = dst >> imm ((64-bit, arithmetic) */ ++ /* dst = dst + imm (64-bit) */ ++ /* dst = dst - imm (64-bit) */ ++ /* dst = dst * imm (64-bit) */ ++ /* dst = dst / imm (64-bit) */ ++ /* dst = dst % imm (64-bit) */ ++ case BPF_ALU64 | BPF_AND | BPF_K: ++ case BPF_ALU64 | BPF_OR | BPF_K: ++ case BPF_ALU64 | BPF_XOR | BPF_K: ++ case BPF_ALU64 | BPF_LSH | BPF_K: ++ case BPF_ALU64 | BPF_RSH | BPF_K: ++ case BPF_ALU64 | BPF_ARSH | BPF_K: ++ case BPF_ALU64 | BPF_ADD | BPF_K: ++ case BPF_ALU64 | BPF_SUB | BPF_K: ++ case BPF_ALU64 | BPF_MUL | BPF_K: ++ case BPF_ALU64 | BPF_DIV | BPF_K: ++ case BPF_ALU64 | BPF_MOD | BPF_K: ++ if (!valid_alu_i(BPF_OP(code), imm)) { ++ emit_mov_i(ctx, MIPS_R_T4, imm); ++ emit_alu_r64(ctx, dst, MIPS_R_T4, BPF_OP(code)); ++ } else if (rewrite_alu_i(BPF_OP(code), imm, &alu, &val)) { ++ emit_alu_i64(ctx, dst, val, alu); ++ } ++ break; ++ /* dst = dst & src (64-bit) */ ++ /* dst = dst | src (64-bit) */ ++ /* dst = dst ^ src (64-bit) */ ++ /* dst = dst << src (64-bit) */ ++ /* dst = dst >> src (64-bit) */ ++ /* dst = dst >> src (64-bit, arithmetic) */ ++ /* dst = dst + src (64-bit) */ ++ /* dst = dst - src (64-bit) */ ++ /* dst = dst * src (64-bit) */ ++ /* dst = dst / src (64-bit) */ ++ /* dst = dst % src (64-bit) */ ++ case BPF_ALU64 | BPF_AND | BPF_X: ++ case BPF_ALU64 | BPF_OR | BPF_X: ++ case BPF_ALU64 | BPF_XOR | BPF_X: ++ case BPF_ALU64 | BPF_LSH | BPF_X: ++ case BPF_ALU64 | BPF_RSH | BPF_X: ++ case BPF_ALU64 | BPF_ARSH | BPF_X: ++ case BPF_ALU64 | BPF_ADD | BPF_X: ++ case BPF_ALU64 | BPF_SUB | BPF_X: ++ case BPF_ALU64 | BPF_MUL | BPF_X: ++ case BPF_ALU64 | BPF_DIV | BPF_X: ++ case BPF_ALU64 | BPF_MOD | BPF_X: ++ emit_alu_r64(ctx, dst, src, BPF_OP(code)); ++ break; ++ /* dst = htole(dst) */ ++ /* dst = htobe(dst) */ ++ case BPF_ALU | BPF_END | BPF_FROM_LE: ++ case BPF_ALU | BPF_END | BPF_FROM_BE: ++ if (BPF_SRC(code) == ++#ifdef __BIG_ENDIAN ++ BPF_FROM_LE ++#else ++ BPF_FROM_BE ++#endif ++ ) ++ emit_bswap_r64(ctx, dst, imm); ++ else ++ emit_trunc_r64(ctx, dst, imm); ++ break; ++ /* dst = imm64 */ ++ case BPF_LD | BPF_IMM | BPF_DW: ++ emit_mov_i64(ctx, dst, (u32)imm | ((u64)insn[1].imm << 32)); ++ return 1; ++ /* LDX: dst = *(size *)(src + off) */ ++ case BPF_LDX | BPF_MEM | BPF_W: ++ case BPF_LDX | BPF_MEM | BPF_H: ++ case BPF_LDX 
| BPF_MEM | BPF_B: ++ case BPF_LDX | BPF_MEM | BPF_DW: ++ emit_ldx(ctx, dst, src, off, BPF_SIZE(code)); ++ break; ++ /* ST: *(size *)(dst + off) = imm */ ++ case BPF_ST | BPF_MEM | BPF_W: ++ case BPF_ST | BPF_MEM | BPF_H: ++ case BPF_ST | BPF_MEM | BPF_B: ++ case BPF_ST | BPF_MEM | BPF_DW: ++ emit_mov_i(ctx, MIPS_R_T4, imm); ++ emit_stx(ctx, dst, MIPS_R_T4, off, BPF_SIZE(code)); ++ break; ++ /* STX: *(size *)(dst + off) = src */ ++ case BPF_STX | BPF_MEM | BPF_W: ++ case BPF_STX | BPF_MEM | BPF_H: ++ case BPF_STX | BPF_MEM | BPF_B: ++ case BPF_STX | BPF_MEM | BPF_DW: ++ emit_stx(ctx, dst, src, off, BPF_SIZE(code)); ++ break; ++ /* Speculation barrier */ ++ case BPF_ST | BPF_NOSPEC: ++ break; ++ /* Atomics */ ++ case BPF_STX | BPF_XADD | BPF_W: ++ case BPF_STX | BPF_XADD | BPF_DW: ++ switch (imm) { ++ case BPF_ADD: ++ case BPF_AND: ++ case BPF_OR: ++ case BPF_XOR: ++ if (BPF_SIZE(code) == BPF_DW) { ++ emit_atomic_r64(ctx, dst, src, off, imm); ++ } else { /* 32-bit, no fetch */ ++ emit_sext(ctx, MIPS_R_T4, src); ++ emit_atomic_r(ctx, dst, MIPS_R_T4, off, imm); ++ } ++ break; ++ default: ++ goto notyet; ++ } ++ break; ++ /* PC += off if dst == src */ ++ /* PC += off if dst != src */ ++ /* PC += off if dst & src */ ++ /* PC += off if dst > src */ ++ /* PC += off if dst >= src */ ++ /* PC += off if dst < src */ ++ /* PC += off if dst <= src */ ++ /* PC += off if dst > src (signed) */ ++ /* PC += off if dst >= src (signed) */ ++ /* PC += off if dst < src (signed) */ ++ /* PC += off if dst <= src (signed) */ ++ case BPF_JMP32 | BPF_JEQ | BPF_X: ++ case BPF_JMP32 | BPF_JNE | BPF_X: ++ case BPF_JMP32 | BPF_JSET | BPF_X: ++ case BPF_JMP32 | BPF_JGT | BPF_X: ++ case BPF_JMP32 | BPF_JGE | BPF_X: ++ case BPF_JMP32 | BPF_JLT | BPF_X: ++ case BPF_JMP32 | BPF_JLE | BPF_X: ++ case BPF_JMP32 | BPF_JSGT | BPF_X: ++ case BPF_JMP32 | BPF_JSGE | BPF_X: ++ case BPF_JMP32 | BPF_JSLT | BPF_X: ++ case BPF_JMP32 | BPF_JSLE | BPF_X: ++ if (off == 0) ++ break; ++ setup_jmp_r(ctx, dst == src, BPF_OP(code), off, &jmp, &rel); ++ emit_sext(ctx, MIPS_R_T4, dst); /* Sign-extended dst */ ++ emit_sext(ctx, MIPS_R_T5, src); /* Sign-extended src */ ++ emit_jmp_r(ctx, MIPS_R_T4, MIPS_R_T5, rel, jmp); ++ if (finish_jmp(ctx, jmp, off) < 0) ++ goto toofar; ++ break; ++ /* PC += off if dst == imm */ ++ /* PC += off if dst != imm */ ++ /* PC += off if dst & imm */ ++ /* PC += off if dst > imm */ ++ /* PC += off if dst >= imm */ ++ /* PC += off if dst < imm */ ++ /* PC += off if dst <= imm */ ++ /* PC += off if dst > imm (signed) */ ++ /* PC += off if dst >= imm (signed) */ ++ /* PC += off if dst < imm (signed) */ ++ /* PC += off if dst <= imm (signed) */ ++ case BPF_JMP32 | BPF_JEQ | BPF_K: ++ case BPF_JMP32 | BPF_JNE | BPF_K: ++ case BPF_JMP32 | BPF_JSET | BPF_K: ++ case BPF_JMP32 | BPF_JGT | BPF_K: ++ case BPF_JMP32 | BPF_JGE | BPF_K: ++ case BPF_JMP32 | BPF_JLT | BPF_K: ++ case BPF_JMP32 | BPF_JLE | BPF_K: ++ case BPF_JMP32 | BPF_JSGT | BPF_K: ++ case BPF_JMP32 | BPF_JSGE | BPF_K: ++ case BPF_JMP32 | BPF_JSLT | BPF_K: ++ case BPF_JMP32 | BPF_JSLE | BPF_K: ++ if (off == 0) ++ break; ++ setup_jmp_i(ctx, imm, 32, BPF_OP(code), off, &jmp, &rel); ++ emit_sext(ctx, MIPS_R_T4, dst); /* Sign-extended dst */ ++ if (valid_jmp_i(jmp, imm)) { ++ emit_jmp_i(ctx, MIPS_R_T4, imm, rel, jmp); ++ } else { ++ /* Move large immediate to register, sign-extended */ ++ emit_mov_i(ctx, MIPS_R_T5, imm); ++ emit_jmp_r(ctx, MIPS_R_T4, MIPS_R_T5, rel, jmp); ++ } ++ if (finish_jmp(ctx, jmp, off) < 0) ++ goto toofar; ++ break; ++ /* PC += off if dst == src */ 
++ /* PC += off if dst != src */ ++ /* PC += off if dst & src */ ++ /* PC += off if dst > src */ ++ /* PC += off if dst >= src */ ++ /* PC += off if dst < src */ ++ /* PC += off if dst <= src */ ++ /* PC += off if dst > src (signed) */ ++ /* PC += off if dst >= src (signed) */ ++ /* PC += off if dst < src (signed) */ ++ /* PC += off if dst <= src (signed) */ ++ case BPF_JMP | BPF_JEQ | BPF_X: ++ case BPF_JMP | BPF_JNE | BPF_X: ++ case BPF_JMP | BPF_JSET | BPF_X: ++ case BPF_JMP | BPF_JGT | BPF_X: ++ case BPF_JMP | BPF_JGE | BPF_X: ++ case BPF_JMP | BPF_JLT | BPF_X: ++ case BPF_JMP | BPF_JLE | BPF_X: ++ case BPF_JMP | BPF_JSGT | BPF_X: ++ case BPF_JMP | BPF_JSGE | BPF_X: ++ case BPF_JMP | BPF_JSLT | BPF_X: ++ case BPF_JMP | BPF_JSLE | BPF_X: ++ if (off == 0) ++ break; ++ setup_jmp_r(ctx, dst == src, BPF_OP(code), off, &jmp, &rel); ++ emit_jmp_r(ctx, dst, src, rel, jmp); ++ if (finish_jmp(ctx, jmp, off) < 0) ++ goto toofar; ++ break; ++ /* PC += off if dst == imm */ ++ /* PC += off if dst != imm */ ++ /* PC += off if dst & imm */ ++ /* PC += off if dst > imm */ ++ /* PC += off if dst >= imm */ ++ /* PC += off if dst < imm */ ++ /* PC += off if dst <= imm */ ++ /* PC += off if dst > imm (signed) */ ++ /* PC += off if dst >= imm (signed) */ ++ /* PC += off if dst < imm (signed) */ ++ /* PC += off if dst <= imm (signed) */ ++ case BPF_JMP | BPF_JEQ | BPF_K: ++ case BPF_JMP | BPF_JNE | BPF_K: ++ case BPF_JMP | BPF_JSET | BPF_K: ++ case BPF_JMP | BPF_JGT | BPF_K: ++ case BPF_JMP | BPF_JGE | BPF_K: ++ case BPF_JMP | BPF_JLT | BPF_K: ++ case BPF_JMP | BPF_JLE | BPF_K: ++ case BPF_JMP | BPF_JSGT | BPF_K: ++ case BPF_JMP | BPF_JSGE | BPF_K: ++ case BPF_JMP | BPF_JSLT | BPF_K: ++ case BPF_JMP | BPF_JSLE | BPF_K: ++ if (off == 0) ++ break; ++ setup_jmp_i(ctx, imm, 64, BPF_OP(code), off, &jmp, &rel); ++ if (valid_jmp_i(jmp, imm)) { ++ emit_jmp_i(ctx, dst, imm, rel, jmp); ++ } else { ++ /* Move large immediate to register */ ++ emit_mov_i(ctx, MIPS_R_T4, imm); ++ emit_jmp_r(ctx, dst, MIPS_R_T4, rel, jmp); ++ } ++ if (finish_jmp(ctx, jmp, off) < 0) ++ goto toofar; ++ break; ++ /* PC += off */ ++ case BPF_JMP | BPF_JA: ++ if (off == 0) ++ break; ++ if (emit_ja(ctx, off) < 0) ++ goto toofar; ++ break; ++ /* Tail call */ ++ case BPF_JMP | BPF_TAIL_CALL: ++ if (emit_tail_call(ctx) < 0) ++ goto invalid; ++ break; ++ /* Function call */ ++ case BPF_JMP | BPF_CALL: ++ if (emit_call(ctx, insn) < 0) ++ goto invalid; ++ break; ++ /* Function return */ ++ case BPF_JMP | BPF_EXIT: ++ /* ++ * Optimization: when last instruction is EXIT ++ * simply continue to epilogue. 
++ */ ++ if (ctx->bpf_index == ctx->program->len - 1) ++ break; ++ if (emit_exit(ctx) < 0) ++ goto toofar; ++ break; ++ ++ default: ++invalid: ++ pr_err_once("unknown opcode %02x\n", code); ++ return -EINVAL; ++notyet: ++ pr_info_once("*** NOT YET: opcode %02x ***\n", code); ++ return -EFAULT; ++toofar: ++ pr_info_once("*** TOO FAR: jump at %u opcode %02x ***\n", ++ ctx->bpf_index, code); ++ return -E2BIG; ++ } ++ return 0; ++} diff --git a/root/target/linux/generic/backport-5.15/050-v5.16-04-mips-bpf-Add-JIT-workarounds-for-CPU-errata.patch b/root/target/linux/generic/backport-5.15/050-v5.16-04-mips-bpf-Add-JIT-workarounds-for-CPU-errata.patch new file mode 100755 index 00000000..63553ebe --- /dev/null +++ b/root/target/linux/generic/backport-5.15/050-v5.16-04-mips-bpf-Add-JIT-workarounds-for-CPU-errata.patch @@ -0,0 +1,120 @@ +From: Johan Almbladh +Date: Tue, 5 Oct 2021 18:54:06 +0200 +Subject: [PATCH] mips: bpf: Add JIT workarounds for CPU errata + +This patch adds workarounds for the following CPU errata to the MIPS +eBPF JIT, if enabled in the kernel configuration. + + - R10000 ll/sc weak ordering + - Loongson-3 ll/sc weak ordering + - Loongson-2F jump hang + +The Loongson-2F nop errata is implemented in uasm, which the JIT uses, +so no additional mitigations are needed for that. + +Signed-off-by: Johan Almbladh +Reviewed-by: Jiaxun Yang +--- + +--- a/arch/mips/net/bpf_jit_comp.c ++++ b/arch/mips/net/bpf_jit_comp.c +@@ -404,6 +404,7 @@ void emit_alu_r(struct jit_context *ctx, + /* Atomic read-modify-write (32-bit) */ + void emit_atomic_r(struct jit_context *ctx, u8 dst, u8 src, s16 off, u8 code) + { ++ LLSC_sync(ctx); + emit(ctx, ll, MIPS_R_T9, off, dst); + switch (code) { + case BPF_ADD: +@@ -420,18 +421,19 @@ void emit_atomic_r(struct jit_context *c + break; + } + emit(ctx, sc, MIPS_R_T8, off, dst); +- emit(ctx, beqz, MIPS_R_T8, -16); ++ emit(ctx, LLSC_beqz, MIPS_R_T8, -16 - LLSC_offset); + emit(ctx, nop); /* Delay slot */ + } + + /* Atomic compare-and-exchange (32-bit) */ + void emit_cmpxchg_r(struct jit_context *ctx, u8 dst, u8 src, u8 res, s16 off) + { ++ LLSC_sync(ctx); + emit(ctx, ll, MIPS_R_T9, off, dst); + emit(ctx, bne, MIPS_R_T9, res, 12); + emit(ctx, move, MIPS_R_T8, src); /* Delay slot */ + emit(ctx, sc, MIPS_R_T8, off, dst); +- emit(ctx, beqz, MIPS_R_T8, -20); ++ emit(ctx, LLSC_beqz, MIPS_R_T8, -20 - LLSC_offset); + emit(ctx, move, res, MIPS_R_T9); /* Delay slot */ + clobber_reg(ctx, res); + } +--- a/arch/mips/net/bpf_jit_comp.h ++++ b/arch/mips/net/bpf_jit_comp.h +@@ -87,7 +87,7 @@ struct jit_context { + }; + + /* Emit the instruction if the JIT memory space has been allocated */ +-#define emit(ctx, func, ...) \ ++#define __emit(ctx, func, ...) \ + do { \ + if ((ctx)->target != NULL) { \ + u32 *p = &(ctx)->target[ctx->jit_index]; \ +@@ -95,6 +95,30 @@ do { \ + } \ + (ctx)->jit_index++; \ + } while (0) ++#define emit(...) 
__emit(__VA_ARGS__) ++ ++/* Workaround for R10000 ll/sc errata */ ++#ifdef CONFIG_WAR_R10000 ++#define LLSC_beqz beqzl ++#else ++#define LLSC_beqz beqz ++#endif ++ ++/* Workaround for Loongson-3 ll/sc errata */ ++#ifdef CONFIG_CPU_LOONGSON3_WORKAROUNDS ++#define LLSC_sync(ctx) emit(ctx, sync, 0) ++#define LLSC_offset 4 ++#else ++#define LLSC_sync(ctx) ++#define LLSC_offset 0 ++#endif ++ ++/* Workaround for Loongson-2F jump errata */ ++#ifdef CONFIG_CPU_JUMP_WORKAROUNDS ++#define JALR_MASK 0xffffffffcfffffffULL ++#else ++#define JALR_MASK (~0ULL) ++#endif + + /* + * Mark a BPF register as accessed, it needs to be +--- a/arch/mips/net/bpf_jit_comp64.c ++++ b/arch/mips/net/bpf_jit_comp64.c +@@ -375,6 +375,7 @@ static void emit_atomic_r64(struct jit_c + u8 t1 = MIPS_R_T6; + u8 t2 = MIPS_R_T7; + ++ LLSC_sync(ctx); + emit(ctx, lld, t1, off, dst); + switch (code) { + case BPF_ADD: +@@ -391,7 +392,7 @@ static void emit_atomic_r64(struct jit_c + break; + } + emit(ctx, scd, t2, off, dst); +- emit(ctx, beqz, t2, -16); ++ emit(ctx, LLSC_beqz, t2, -16 - LLSC_offset); + emit(ctx, nop); /* Delay slot */ + } + +@@ -414,7 +415,7 @@ static int emit_call(struct jit_context + push_regs(ctx, ctx->clobbered & JIT_CALLER_REGS, 0, 0); + + /* Emit function call */ +- emit_mov_i64(ctx, tmp, addr); ++ emit_mov_i64(ctx, tmp, addr & JALR_MASK); + emit(ctx, jalr, MIPS_R_RA, tmp); + emit(ctx, nop); /* Delay slot */ + diff --git a/root/target/linux/generic/backport-5.15/050-v5.16-05-mips-bpf-Enable-eBPF-JITs.patch b/root/target/linux/generic/backport-5.15/050-v5.16-05-mips-bpf-Enable-eBPF-JITs.patch new file mode 100755 index 00000000..00b3536b --- /dev/null +++ b/root/target/linux/generic/backport-5.15/050-v5.16-05-mips-bpf-Enable-eBPF-JITs.patch @@ -0,0 +1,61 @@ +From: Johan Almbladh +Date: Tue, 5 Oct 2021 18:54:07 +0200 +Subject: [PATCH] mips: bpf: Enable eBPF JITs + +This patch enables the new eBPF JITs for 32-bit and 64-bit MIPS. It also +disables the old cBPF JIT to so cBPF programs are converted to use the +new JIT. + +Workarounds for R4000 CPU errata are not implemented by the JIT, so the +JIT is disabled if any of those workarounds are configured. 
+ +Signed-off-by: Johan Almbladh +--- + +--- a/MAINTAINERS ++++ b/MAINTAINERS +@@ -3428,6 +3428,7 @@ S: Supported + F: arch/arm64/net/ + + BPF JIT for MIPS (32-BIT AND 64-BIT) ++M: Johan Almbladh + M: Paul Burton + L: netdev@vger.kernel.org + L: bpf@vger.kernel.org +--- a/arch/mips/Kconfig ++++ b/arch/mips/Kconfig +@@ -57,7 +57,6 @@ config MIPS + select HAVE_ARCH_TRACEHOOK + select HAVE_ARCH_TRANSPARENT_HUGEPAGE if CPU_SUPPORTS_HUGEPAGES + select HAVE_ASM_MODVERSIONS +- select HAVE_CBPF_JIT if !64BIT && !CPU_MICROMIPS + select HAVE_CONTEXT_TRACKING + select HAVE_TIF_NOHZ + select HAVE_C_RECORDMCOUNT +@@ -65,7 +64,10 @@ config MIPS + select HAVE_DEBUG_STACKOVERFLOW + select HAVE_DMA_CONTIGUOUS + select HAVE_DYNAMIC_FTRACE +- select HAVE_EBPF_JIT if 64BIT && !CPU_MICROMIPS && TARGET_ISA_REV >= 2 ++ select HAVE_EBPF_JIT if !CPU_MICROMIPS && \ ++ !CPU_DADDI_WORKAROUNDS && \ ++ !CPU_R4000_WORKAROUNDS && \ ++ !CPU_R4400_WORKAROUNDS + select HAVE_EXIT_THREAD + select HAVE_FAST_GUP + select HAVE_FTRACE_MCOUNT_RECORD +--- a/arch/mips/net/Makefile ++++ b/arch/mips/net/Makefile +@@ -2,9 +2,10 @@ + # MIPS networking code + + obj-$(CONFIG_MIPS_CBPF_JIT) += bpf_jit.o bpf_jit_asm.o ++obj-$(CONFIG_MIPS_EBPF_JIT) += bpf_jit_comp.o + + ifeq ($(CONFIG_32BIT),y) +- obj-$(CONFIG_MIPS_EBPF_JIT) += bpf_jit_comp.o bpf_jit_comp32.o ++ obj-$(CONFIG_MIPS_EBPF_JIT) += bpf_jit_comp32.o + else +- obj-$(CONFIG_MIPS_EBPF_JIT) += ebpf_jit.o ++ obj-$(CONFIG_MIPS_EBPF_JIT) += bpf_jit_comp64.o + endif diff --git a/root/target/linux/generic/backport-5.15/050-v5.16-06-mips-bpf-Remove-old-BPF-JIT-implementations.patch b/root/target/linux/generic/backport-5.15/050-v5.16-06-mips-bpf-Remove-old-BPF-JIT-implementations.patch new file mode 100755 index 00000000..e25c3368 --- /dev/null +++ b/root/target/linux/generic/backport-5.15/050-v5.16-06-mips-bpf-Remove-old-BPF-JIT-implementations.patch @@ -0,0 +1,387 @@ +From: Johan Almbladh +Date: Tue, 5 Oct 2021 18:54:08 +0200 +Subject: [PATCH] mips: bpf: Remove old BPF JIT implementations + +This patch removes the old 32-bit cBPF and 64-bit eBPF JIT implementations. +They are replaced by a new eBPF implementation that supports both 32-bit +and 64-bit MIPS CPUs. + +Signed-off-by: Johan Almbladh +--- + delete mode 100644 arch/mips/net/bpf_jit.c + delete mode 100644 arch/mips/net/bpf_jit.h + delete mode 100644 arch/mips/net/bpf_jit_asm.S + delete mode 100644 arch/mips/net/ebpf_jit.c + +--- a/arch/mips/net/bpf_jit.h ++++ /dev/null +@@ -1,81 +0,0 @@ +-/* SPDX-License-Identifier: GPL-2.0-only */ +-/* +- * Just-In-Time compiler for BPF filters on MIPS +- * +- * Copyright (c) 2014 Imagination Technologies Ltd. 
+- * Author: Markos Chandras +- */ +- +-#ifndef BPF_JIT_MIPS_OP_H +-#define BPF_JIT_MIPS_OP_H +- +-/* Registers used by JIT */ +-#define MIPS_R_ZERO 0 +-#define MIPS_R_V0 2 +-#define MIPS_R_A0 4 +-#define MIPS_R_A1 5 +-#define MIPS_R_T4 12 +-#define MIPS_R_T5 13 +-#define MIPS_R_T6 14 +-#define MIPS_R_T7 15 +-#define MIPS_R_S0 16 +-#define MIPS_R_S1 17 +-#define MIPS_R_S2 18 +-#define MIPS_R_S3 19 +-#define MIPS_R_S4 20 +-#define MIPS_R_S5 21 +-#define MIPS_R_S6 22 +-#define MIPS_R_S7 23 +-#define MIPS_R_SP 29 +-#define MIPS_R_RA 31 +- +-/* Conditional codes */ +-#define MIPS_COND_EQ 0x1 +-#define MIPS_COND_GE (0x1 << 1) +-#define MIPS_COND_GT (0x1 << 2) +-#define MIPS_COND_NE (0x1 << 3) +-#define MIPS_COND_ALL (0x1 << 4) +-/* Conditionals on X register or K immediate */ +-#define MIPS_COND_X (0x1 << 5) +-#define MIPS_COND_K (0x1 << 6) +- +-#define r_ret MIPS_R_V0 +- +-/* +- * Use 2 scratch registers to avoid pipeline interlocks. +- * There is no overhead during epilogue and prologue since +- * any of the $s0-$s6 registers will only be preserved if +- * they are going to actually be used. +- */ +-#define r_skb_hl MIPS_R_S0 /* skb header length */ +-#define r_skb_data MIPS_R_S1 /* skb actual data */ +-#define r_off MIPS_R_S2 +-#define r_A MIPS_R_S3 +-#define r_X MIPS_R_S4 +-#define r_skb MIPS_R_S5 +-#define r_M MIPS_R_S6 +-#define r_skb_len MIPS_R_S7 +-#define r_s0 MIPS_R_T4 /* scratch reg 1 */ +-#define r_s1 MIPS_R_T5 /* scratch reg 2 */ +-#define r_tmp_imm MIPS_R_T6 /* No need to preserve this */ +-#define r_tmp MIPS_R_T7 /* No need to preserve this */ +-#define r_zero MIPS_R_ZERO +-#define r_sp MIPS_R_SP +-#define r_ra MIPS_R_RA +- +-#ifndef __ASSEMBLY__ +- +-/* Declare ASM helpers */ +- +-#define DECLARE_LOAD_FUNC(func) \ +- extern u8 func(unsigned long *skb, int offset); \ +- extern u8 func##_negative(unsigned long *skb, int offset); \ +- extern u8 func##_positive(unsigned long *skb, int offset) +- +-DECLARE_LOAD_FUNC(sk_load_word); +-DECLARE_LOAD_FUNC(sk_load_half); +-DECLARE_LOAD_FUNC(sk_load_byte); +- +-#endif +- +-#endif /* BPF_JIT_MIPS_OP_H */ +--- a/arch/mips/net/bpf_jit_asm.S ++++ /dev/null +@@ -1,285 +0,0 @@ +-/* +- * bpf_jib_asm.S: Packet/header access helper functions for MIPS/MIPS64 BPF +- * compiler. +- * +- * Copyright (C) 2015 Imagination Technologies Ltd. +- * Author: Markos Chandras +- * +- * This program is free software; you can redistribute it and/or modify it +- * under the terms of the GNU General Public License as published by the +- * Free Software Foundation; version 2 of the License. +- */ +- +-#include +-#include +-#include +-#include "bpf_jit.h" +- +-/* ABI +- * +- * r_skb_hl skb header length +- * r_skb_data skb data +- * r_off(a1) offset register +- * r_A BPF register A +- * r_X PF register X +- * r_skb(a0) *skb +- * r_M *scratch memory +- * r_skb_le skb length +- * r_s0 Scratch register 0 +- * r_s1 Scratch register 1 +- * +- * On entry: +- * a0: *skb +- * a1: offset (imm or imm + X) +- * +- * All non-BPF-ABI registers are free for use. On return, we only +- * care about r_ret. The BPF-ABI registers are assumed to remain +- * unmodified during the entire filter operation. 
+- */ +- +-#define skb a0 +-#define offset a1 +-#define SKF_LL_OFF (-0x200000) /* Can't include linux/filter.h in assembly */ +- +- /* We know better :) so prevent assembler reordering etc */ +- .set noreorder +- +-#define is_offset_negative(TYPE) \ +- /* If offset is negative we have more work to do */ \ +- slti t0, offset, 0; \ +- bgtz t0, bpf_slow_path_##TYPE##_neg; \ +- /* Be careful what follows in DS. */ +- +-#define is_offset_in_header(SIZE, TYPE) \ +- /* Reading from header? */ \ +- addiu $r_s0, $r_skb_hl, -SIZE; \ +- slt t0, $r_s0, offset; \ +- bgtz t0, bpf_slow_path_##TYPE; \ +- +-LEAF(sk_load_word) +- is_offset_negative(word) +-FEXPORT(sk_load_word_positive) +- is_offset_in_header(4, word) +- /* Offset within header boundaries */ +- PTR_ADDU t1, $r_skb_data, offset +- .set reorder +- lw $r_A, 0(t1) +- .set noreorder +-#ifdef CONFIG_CPU_LITTLE_ENDIAN +-# if MIPS_ISA_REV >= 2 +- wsbh t0, $r_A +- rotr $r_A, t0, 16 +-# else +- sll t0, $r_A, 24 +- srl t1, $r_A, 24 +- srl t2, $r_A, 8 +- or t0, t0, t1 +- andi t2, t2, 0xff00 +- andi t1, $r_A, 0xff00 +- or t0, t0, t2 +- sll t1, t1, 8 +- or $r_A, t0, t1 +-# endif +-#endif +- jr $r_ra +- move $r_ret, zero +- END(sk_load_word) +- +-LEAF(sk_load_half) +- is_offset_negative(half) +-FEXPORT(sk_load_half_positive) +- is_offset_in_header(2, half) +- /* Offset within header boundaries */ +- PTR_ADDU t1, $r_skb_data, offset +- lhu $r_A, 0(t1) +-#ifdef CONFIG_CPU_LITTLE_ENDIAN +-# if MIPS_ISA_REV >= 2 +- wsbh $r_A, $r_A +-# else +- sll t0, $r_A, 8 +- srl t1, $r_A, 8 +- andi t0, t0, 0xff00 +- or $r_A, t0, t1 +-# endif +-#endif +- jr $r_ra +- move $r_ret, zero +- END(sk_load_half) +- +-LEAF(sk_load_byte) +- is_offset_negative(byte) +-FEXPORT(sk_load_byte_positive) +- is_offset_in_header(1, byte) +- /* Offset within header boundaries */ +- PTR_ADDU t1, $r_skb_data, offset +- lbu $r_A, 0(t1) +- jr $r_ra +- move $r_ret, zero +- END(sk_load_byte) +- +-/* +- * call skb_copy_bits: +- * (prototype in linux/skbuff.h) +- * +- * int skb_copy_bits(sk_buff *skb, int offset, void *to, int len) +- * +- * o32 mandates we leave 4 spaces for argument registers in case +- * the callee needs to use them. Even though we don't care about +- * the argument registers ourselves, we need to allocate that space +- * to remain ABI compliant since the callee may want to use that space. +- * We also allocate 2 more spaces for $r_ra and our return register (*to). +- * +- * n64 is a bit different. The *caller* will allocate the space to preserve +- * the arguments. So in 64-bit kernels, we allocate the 4-arg space for no +- * good reason but it does not matter that much really. +- * +- * (void *to) is returned in r_s0 +- * +- */ +-#ifdef CONFIG_CPU_LITTLE_ENDIAN +-#define DS_OFFSET(SIZE) (4 * SZREG) +-#else +-#define DS_OFFSET(SIZE) ((4 * SZREG) + (4 - SIZE)) +-#endif +-#define bpf_slow_path_common(SIZE) \ +- /* Quick check. Are we within reasonable boundaries? */ \ +- LONG_ADDIU $r_s1, $r_skb_len, -SIZE; \ +- sltu $r_s0, offset, $r_s1; \ +- beqz $r_s0, fault; \ +- /* Load 4th argument in DS */ \ +- LONG_ADDIU a3, zero, SIZE; \ +- PTR_ADDIU $r_sp, $r_sp, -(6 * SZREG); \ +- PTR_LA t0, skb_copy_bits; \ +- PTR_S $r_ra, (5 * SZREG)($r_sp); \ +- /* Assign low slot to a2 */ \ +- PTR_ADDIU a2, $r_sp, DS_OFFSET(SIZE); \ +- jalr t0; \ +- /* Reset our destination slot (DS but it's ok) */ \ +- INT_S zero, (4 * SZREG)($r_sp); \ +- /* \ +- * skb_copy_bits returns 0 on success and -EFAULT \ +- * on error. Our data live in a2. Do not bother with \ +- * our data if an error has been returned. 
\ +- */ \ +- /* Restore our frame */ \ +- PTR_L $r_ra, (5 * SZREG)($r_sp); \ +- INT_L $r_s0, (4 * SZREG)($r_sp); \ +- bltz v0, fault; \ +- PTR_ADDIU $r_sp, $r_sp, 6 * SZREG; \ +- move $r_ret, zero; \ +- +-NESTED(bpf_slow_path_word, (6 * SZREG), $r_sp) +- bpf_slow_path_common(4) +-#ifdef CONFIG_CPU_LITTLE_ENDIAN +-# if MIPS_ISA_REV >= 2 +- wsbh t0, $r_s0 +- jr $r_ra +- rotr $r_A, t0, 16 +-# else +- sll t0, $r_s0, 24 +- srl t1, $r_s0, 24 +- srl t2, $r_s0, 8 +- or t0, t0, t1 +- andi t2, t2, 0xff00 +- andi t1, $r_s0, 0xff00 +- or t0, t0, t2 +- sll t1, t1, 8 +- jr $r_ra +- or $r_A, t0, t1 +-# endif +-#else +- jr $r_ra +- move $r_A, $r_s0 +-#endif +- +- END(bpf_slow_path_word) +- +-NESTED(bpf_slow_path_half, (6 * SZREG), $r_sp) +- bpf_slow_path_common(2) +-#ifdef CONFIG_CPU_LITTLE_ENDIAN +-# if MIPS_ISA_REV >= 2 +- jr $r_ra +- wsbh $r_A, $r_s0 +-# else +- sll t0, $r_s0, 8 +- andi t1, $r_s0, 0xff00 +- andi t0, t0, 0xff00 +- srl t1, t1, 8 +- jr $r_ra +- or $r_A, t0, t1 +-# endif +-#else +- jr $r_ra +- move $r_A, $r_s0 +-#endif +- +- END(bpf_slow_path_half) +- +-NESTED(bpf_slow_path_byte, (6 * SZREG), $r_sp) +- bpf_slow_path_common(1) +- jr $r_ra +- move $r_A, $r_s0 +- +- END(bpf_slow_path_byte) +- +-/* +- * Negative entry points +- */ +- .macro bpf_is_end_of_data +- li t0, SKF_LL_OFF +- /* Reading link layer data? */ +- slt t1, offset, t0 +- bgtz t1, fault +- /* Be careful what follows in DS. */ +- .endm +-/* +- * call skb_copy_bits: +- * (prototype in linux/filter.h) +- * +- * void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, +- * int k, unsigned int size) +- * +- * see above (bpf_slow_path_common) for ABI restrictions +- */ +-#define bpf_negative_common(SIZE) \ +- PTR_ADDIU $r_sp, $r_sp, -(6 * SZREG); \ +- PTR_LA t0, bpf_internal_load_pointer_neg_helper; \ +- PTR_S $r_ra, (5 * SZREG)($r_sp); \ +- jalr t0; \ +- li a2, SIZE; \ +- PTR_L $r_ra, (5 * SZREG)($r_sp); \ +- /* Check return pointer */ \ +- beqz v0, fault; \ +- PTR_ADDIU $r_sp, $r_sp, 6 * SZREG; \ +- /* Preserve our pointer */ \ +- move $r_s0, v0; \ +- /* Set return value */ \ +- move $r_ret, zero; \ +- +-bpf_slow_path_word_neg: +- bpf_is_end_of_data +-NESTED(sk_load_word_negative, (6 * SZREG), $r_sp) +- bpf_negative_common(4) +- jr $r_ra +- lw $r_A, 0($r_s0) +- END(sk_load_word_negative) +- +-bpf_slow_path_half_neg: +- bpf_is_end_of_data +-NESTED(sk_load_half_negative, (6 * SZREG), $r_sp) +- bpf_negative_common(2) +- jr $r_ra +- lhu $r_A, 0($r_s0) +- END(sk_load_half_negative) +- +-bpf_slow_path_byte_neg: +- bpf_is_end_of_data +-NESTED(sk_load_byte_negative, (6 * SZREG), $r_sp) +- bpf_negative_common(1) +- jr $r_ra +- lbu $r_A, 0($r_s0) +- END(sk_load_byte_negative) +- +-fault: +- jr $r_ra +- addiu $r_ret, zero, 1 diff --git a/root/target/linux/generic/backport-5.15/734-v5.16-0001-net-bgmac-improve-handling-PHY.patch b/root/target/linux/generic/backport-5.15/734-v5.16-0001-net-bgmac-improve-handling-PHY.patch new file mode 100755 index 00000000..6788a2ec --- /dev/null +++ b/root/target/linux/generic/backport-5.15/734-v5.16-0001-net-bgmac-improve-handling-PHY.patch @@ -0,0 +1,84 @@ +From b5375509184dc23d2b7fa0c5ed8763899ccc9674 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= +Date: Sat, 2 Oct 2021 19:58:11 +0200 +Subject: [PATCH] net: bgmac: improve handling PHY +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +1. Use info from DT if available + +It allows describing for example a fixed link. 
It's more accurate than +just guessing there may be one (depending on a chipset). + +2. Verify PHY ID before trying to connect PHY + +PHY addr 0x1e (30) is special in Broadcom routers and means a switch +connected as MDIO devices instead of a real PHY. Don't try connecting to +it. + +Signed-off-by: RafaÅ‚ MiÅ‚ecki +Signed-off-by: David S. Miller +--- + drivers/net/ethernet/broadcom/bgmac-bcma.c | 33 ++++++++++++++-------- + 1 file changed, 21 insertions(+), 12 deletions(-) + +--- a/drivers/net/ethernet/broadcom/bgmac-bcma.c ++++ b/drivers/net/ethernet/broadcom/bgmac-bcma.c +@@ -11,6 +11,7 @@ + #include + #include + #include ++#include + #include + #include "bgmac.h" + +@@ -86,17 +87,28 @@ static int bcma_phy_connect(struct bgmac + struct phy_device *phy_dev; + char bus_id[MII_BUS_ID_SIZE + 3]; + ++ /* DT info should be the most accurate */ ++ phy_dev = of_phy_get_and_connect(bgmac->net_dev, bgmac->dev->of_node, ++ bgmac_adjust_link); ++ if (phy_dev) ++ return 0; ++ + /* Connect to the PHY */ +- snprintf(bus_id, sizeof(bus_id), PHY_ID_FMT, bgmac->mii_bus->id, +- bgmac->phyaddr); +- phy_dev = phy_connect(bgmac->net_dev, bus_id, bgmac_adjust_link, +- PHY_INTERFACE_MODE_MII); +- if (IS_ERR(phy_dev)) { +- dev_err(bgmac->dev, "PHY connection failed\n"); +- return PTR_ERR(phy_dev); ++ if (bgmac->mii_bus && bgmac->phyaddr != BGMAC_PHY_NOREGS) { ++ snprintf(bus_id, sizeof(bus_id), PHY_ID_FMT, bgmac->mii_bus->id, ++ bgmac->phyaddr); ++ phy_dev = phy_connect(bgmac->net_dev, bus_id, bgmac_adjust_link, ++ PHY_INTERFACE_MODE_MII); ++ if (IS_ERR(phy_dev)) { ++ dev_err(bgmac->dev, "PHY connection failed\n"); ++ return PTR_ERR(phy_dev); ++ } ++ ++ return 0; + } + +- return 0; ++ /* Assume a fixed link to the switch port */ ++ return bgmac_phy_connect_direct(bgmac); + } + + static const struct bcma_device_id bgmac_bcma_tbl[] = { +@@ -297,10 +309,7 @@ static int bgmac_probe(struct bcma_devic + bgmac->cco_ctl_maskset = bcma_bgmac_cco_ctl_maskset; + bgmac->get_bus_clock = bcma_bgmac_get_bus_clock; + bgmac->cmn_maskset32 = bcma_bgmac_cmn_maskset32; +- if (bgmac->mii_bus) +- bgmac->phy_connect = bcma_phy_connect; +- else +- bgmac->phy_connect = bgmac_phy_connect_direct; ++ bgmac->phy_connect = bcma_phy_connect; + + err = bgmac_enet_probe(bgmac); + if (err) diff --git a/root/target/linux/generic/backport-5.15/734-v5.16-0002-net-bgmac-support-MDIO-described-in-DT.patch b/root/target/linux/generic/backport-5.15/734-v5.16-0002-net-bgmac-support-MDIO-described-in-DT.patch new file mode 100755 index 00000000..f1348282 --- /dev/null +++ b/root/target/linux/generic/backport-5.15/734-v5.16-0002-net-bgmac-support-MDIO-described-in-DT.patch @@ -0,0 +1,54 @@ +From 45c9d966688e7fad7f24bfc450547d91e4304d0b Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= +Date: Sat, 2 Oct 2021 19:58:12 +0200 +Subject: [PATCH] net: bgmac: support MDIO described in DT +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Check ethernet controller DT node for "mdio" subnode and use it with +of_mdiobus_register() when present. That allows specifying MDIO and its +PHY devices in a standard DT based way. + +This is required for BCM53573 SoC support. That family is sometimes +called Northstar (by marketing?) but is quite different from it. It uses +different CPU(s) and many different hw blocks. + +One of shared blocks in BCM53573 is Ethernet controller. Switch however +is not SRAB accessible (as it Northstar) but is MDIO attached. 
+ +Signed-off-by: RafaÅ‚ MiÅ‚ecki +Signed-off-by: David S. Miller +--- + drivers/net/ethernet/broadcom/bgmac-bcma-mdio.c | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +--- a/drivers/net/ethernet/broadcom/bgmac-bcma-mdio.c ++++ b/drivers/net/ethernet/broadcom/bgmac-bcma-mdio.c +@@ -10,6 +10,7 @@ + + #include + #include ++#include + #include "bgmac.h" + + static bool bcma_mdio_wait_value(struct bcma_device *core, u16 reg, u32 mask, +@@ -211,6 +212,7 @@ struct mii_bus *bcma_mdio_mii_register(s + { + struct bcma_device *core = bgmac->bcma.core; + struct mii_bus *mii_bus; ++ struct device_node *np; + int err; + + mii_bus = mdiobus_alloc(); +@@ -229,7 +231,9 @@ struct mii_bus *bcma_mdio_mii_register(s + mii_bus->parent = &core->dev; + mii_bus->phy_mask = ~(1 << bgmac->phyaddr); + +- err = mdiobus_register(mii_bus); ++ np = of_get_child_by_name(core->dev.of_node, "mdio"); ++ ++ err = of_mdiobus_register(mii_bus, np); + if (err) { + dev_err(&core->dev, "Registration of mii bus failed\n"); + goto err_free_bus; diff --git a/root/target/linux/generic/backport-5.15/742-v5.16-net-phy-at803x-add-support-for-qca-8327-internal-phy.patch b/root/target/linux/generic/backport-5.15/742-v5.16-net-phy-at803x-add-support-for-qca-8327-internal-phy.patch new file mode 100755 index 00000000..e4bead89 --- /dev/null +++ b/root/target/linux/generic/backport-5.15/742-v5.16-net-phy-at803x-add-support-for-qca-8327-internal-phy.patch @@ -0,0 +1,48 @@ +From 0ccf8511182436183c031e8a2f740ae91a02c625 Mon Sep 17 00:00:00 2001 +From: Ansuel Smith +Date: Tue, 14 Sep 2021 14:33:45 +0200 +Subject: net: phy: at803x: add support for qca 8327 internal phy + +Add support for qca8327 internal phy needed for correct init of the +switch port. It does use the same qca8337 function and reg just with a +different id. + +Signed-off-by: Ansuel Smith +Tested-by: Rosen Penev +Tested-by: Andrew Lunn +Signed-off-by: David S. 
Miller +--- + drivers/net/phy/at803x.c | 15 +++++++++++++++ + 1 file changed, 15 insertions(+) + +--- a/drivers/net/phy/at803x.c ++++ b/drivers/net/phy/at803x.c +@@ -1420,6 +1420,19 @@ static struct phy_driver at803x_driver[] + .get_sset_count = at803x_get_sset_count, + .get_strings = at803x_get_strings, + .get_stats = at803x_get_stats, ++}, { ++ /* QCA8327 */ ++ .phy_id = QCA8327_PHY_ID, ++ .phy_id_mask = QCA8K_PHY_ID_MASK, ++ .name = "QCA PHY 8327", ++ /* PHY_GBIT_FEATURES */ ++ .probe = at803x_probe, ++ .flags = PHY_IS_INTERNAL, ++ .config_init = qca83xx_config_init, ++ .soft_reset = genphy_soft_reset, ++ .get_sset_count = at803x_get_sset_count, ++ .get_strings = at803x_get_strings, ++ .get_stats = at803x_get_stats, + }, }; + + module_phy_driver(at803x_driver); +@@ -1430,6 +1443,8 @@ static struct mdio_device_id __maybe_unu + { PHY_ID_MATCH_EXACT(ATH8032_PHY_ID) }, + { PHY_ID_MATCH_EXACT(ATH8035_PHY_ID) }, + { PHY_ID_MATCH_EXACT(ATH9331_PHY_ID) }, ++ { PHY_ID_MATCH_EXACT(QCA8337_PHY_ID) }, ++ { PHY_ID_MATCH_EXACT(QCA8327_PHY_ID) }, + { } + }; + diff --git a/root/target/linux/generic/backport-5.15/743-v5.16-0001-net-dsa-b53-Include-all-ports-in-enabled_ports.patch b/root/target/linux/generic/backport-5.15/743-v5.16-0001-net-dsa-b53-Include-all-ports-in-enabled_ports.patch new file mode 100755 index 00000000..eb84b45b --- /dev/null +++ b/root/target/linux/generic/backport-5.15/743-v5.16-0001-net-dsa-b53-Include-all-ports-in-enabled_ports.patch @@ -0,0 +1,131 @@ +From 983d96a9116a328668601555d96736261d33170c Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= +Date: Thu, 16 Sep 2021 14:03:51 +0200 +Subject: [PATCH] net: dsa: b53: Include all ports in "enabled_ports" +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Make "enabled_ports" bitfield contain all available switch ports +including a CPU port. This way there is no need for fixup during +initialization. + +For BCM53010, BCM53018 and BCM53019 include also other available ports. 
+ +Signed-off-by: RafaÅ‚ MiÅ‚ecki +Reviewed-by: Florian Fainelli +Tested-by: Florian Fainelli +Signed-off-by: Jakub Kicinski +--- + drivers/net/dsa/b53/b53_common.c | 23 +++++++++++------------ + 1 file changed, 11 insertions(+), 12 deletions(-) + +--- a/drivers/net/dsa/b53/b53_common.c ++++ b/drivers/net/dsa/b53/b53_common.c +@@ -2302,7 +2302,7 @@ static const struct b53_chip_data b53_sw + .chip_id = BCM5325_DEVICE_ID, + .dev_name = "BCM5325", + .vlans = 16, +- .enabled_ports = 0x1f, ++ .enabled_ports = 0x3f, + .arl_bins = 2, + .arl_buckets = 1024, + .imp_port = 5, +@@ -2313,7 +2313,7 @@ static const struct b53_chip_data b53_sw + .chip_id = BCM5365_DEVICE_ID, + .dev_name = "BCM5365", + .vlans = 256, +- .enabled_ports = 0x1f, ++ .enabled_ports = 0x3f, + .arl_bins = 2, + .arl_buckets = 1024, + .imp_port = 5, +@@ -2324,7 +2324,7 @@ static const struct b53_chip_data b53_sw + .chip_id = BCM5389_DEVICE_ID, + .dev_name = "BCM5389", + .vlans = 4096, +- .enabled_ports = 0x1f, ++ .enabled_ports = 0x11f, + .arl_bins = 4, + .arl_buckets = 1024, + .imp_port = 8, +@@ -2338,7 +2338,7 @@ static const struct b53_chip_data b53_sw + .chip_id = BCM5395_DEVICE_ID, + .dev_name = "BCM5395", + .vlans = 4096, +- .enabled_ports = 0x1f, ++ .enabled_ports = 0x11f, + .arl_bins = 4, + .arl_buckets = 1024, + .imp_port = 8, +@@ -2352,7 +2352,7 @@ static const struct b53_chip_data b53_sw + .chip_id = BCM5397_DEVICE_ID, + .dev_name = "BCM5397", + .vlans = 4096, +- .enabled_ports = 0x1f, ++ .enabled_ports = 0x11f, + .arl_bins = 4, + .arl_buckets = 1024, + .imp_port = 8, +@@ -2366,7 +2366,7 @@ static const struct b53_chip_data b53_sw + .chip_id = BCM5398_DEVICE_ID, + .dev_name = "BCM5398", + .vlans = 4096, +- .enabled_ports = 0x7f, ++ .enabled_ports = 0x17f, + .arl_bins = 4, + .arl_buckets = 1024, + .imp_port = 8, +@@ -2380,7 +2380,7 @@ static const struct b53_chip_data b53_sw + .chip_id = BCM53115_DEVICE_ID, + .dev_name = "BCM53115", + .vlans = 4096, +- .enabled_ports = 0x1f, ++ .enabled_ports = 0x11f, + .arl_bins = 4, + .arl_buckets = 1024, + .vta_regs = B53_VTA_REGS, +@@ -2394,7 +2394,7 @@ static const struct b53_chip_data b53_sw + .chip_id = BCM53125_DEVICE_ID, + .dev_name = "BCM53125", + .vlans = 4096, +- .enabled_ports = 0xff, ++ .enabled_ports = 0x1ff, + .arl_bins = 4, + .arl_buckets = 1024, + .imp_port = 8, +@@ -2436,7 +2436,7 @@ static const struct b53_chip_data b53_sw + .chip_id = BCM53010_DEVICE_ID, + .dev_name = "BCM53010", + .vlans = 4096, +- .enabled_ports = 0x1f, ++ .enabled_ports = 0x1bf, + .arl_bins = 4, + .arl_buckets = 1024, + .imp_port = 8, +@@ -2478,7 +2478,7 @@ static const struct b53_chip_data b53_sw + .chip_id = BCM53018_DEVICE_ID, + .dev_name = "BCM53018", + .vlans = 4096, +- .enabled_ports = 0x1f, ++ .enabled_ports = 0x1bf, + .arl_bins = 4, + .arl_buckets = 1024, + .imp_port = 8, +@@ -2492,7 +2492,7 @@ static const struct b53_chip_data b53_sw + .chip_id = BCM53019_DEVICE_ID, + .dev_name = "BCM53019", + .vlans = 4096, +- .enabled_ports = 0x1f, ++ .enabled_ports = 0x1bf, + .arl_bins = 4, + .arl_buckets = 1024, + .imp_port = 8, +@@ -2634,7 +2634,6 @@ static int b53_switch_init(struct b53_de + dev->cpu_port = 5; + } + +- dev->enabled_ports |= BIT(dev->cpu_port); + dev->num_ports = fls(dev->enabled_ports); + + dev->ds->num_ports = min_t(unsigned int, dev->num_ports, DSA_MAX_PORTS); diff --git a/root/target/linux/generic/backport-5.15/743-v5.16-0002-net-dsa-b53-Drop-BCM5301x-workaround-for-a-wrong-CPU.patch 
b/root/target/linux/generic/backport-5.15/743-v5.16-0002-net-dsa-b53-Drop-BCM5301x-workaround-for-a-wrong-CPU.patch new file mode 100755 index 00000000..23805a90 --- /dev/null +++ b/root/target/linux/generic/backport-5.15/743-v5.16-0002-net-dsa-b53-Drop-BCM5301x-workaround-for-a-wrong-CPU.patch @@ -0,0 +1,42 @@ +From b290c6384afabbca5ae6e2af72fb1b2bc37922be Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= +Date: Thu, 16 Sep 2021 14:03:52 +0200 +Subject: [PATCH] net: dsa: b53: Drop BCM5301x workaround for a wrong CPU/IMP + port +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +On BCM5301x port 8 requires a fixed link when used. + +Years ago when b53 was an OpenWrt downstream driver (with configuration +based on sometimes bugged NVRAM) there was a need for a fixup. In case +of forcing fixed link for (incorrectly specified) port 5 the code had to +actually setup port 8 link. + +For upstream b53 driver with setup based on DT there is no need for that +workaround. In DT we have and require correct ports setup. + +Signed-off-by: RafaÅ‚ MiÅ‚ecki +Reviewed-by: Florian Fainelli +Tested-by: Florian Fainelli +Signed-off-by: Jakub Kicinski +--- + drivers/net/dsa/b53/b53_common.c | 6 ------ + 1 file changed, 6 deletions(-) + +--- a/drivers/net/dsa/b53/b53_common.c ++++ b/drivers/net/dsa/b53/b53_common.c +@@ -1291,12 +1291,6 @@ static void b53_adjust_link(struct dsa_s + return; + } + } +- } else if (is5301x(dev)) { +- if (port != dev->cpu_port) { +- b53_force_port_config(dev, dev->cpu_port, 2000, +- DUPLEX_FULL, true, true); +- b53_force_link(dev, dev->cpu_port, 1); +- } + } + + /* Re-negotiate EEE if it was enabled already */ diff --git a/root/target/linux/generic/backport-5.15/743-v5.16-0003-net-dsa-b53-Improve-flow-control-setup-on-BCM5301x.patch b/root/target/linux/generic/backport-5.15/743-v5.16-0003-net-dsa-b53-Improve-flow-control-setup-on-BCM5301x.patch new file mode 100755 index 00000000..941fa23e --- /dev/null +++ b/root/target/linux/generic/backport-5.15/743-v5.16-0003-net-dsa-b53-Improve-flow-control-setup-on-BCM5301x.patch @@ -0,0 +1,32 @@ +From 3ff26b29230c54fea2353b63124c589b61953e14 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= +Date: Thu, 16 Sep 2021 14:03:53 +0200 +Subject: [PATCH] net: dsa: b53: Improve flow control setup on BCM5301x +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +According to the Broadcom's reference driver flow control needs to be +enabled for any CPU switch port (5, 7 or 8 - depending on which one is +used). Current code makes it work only for the port 5. Use +dsa_is_cpu_port() which solved that problem. 
+ +Signed-off-by: RafaÅ‚ MiÅ‚ecki +Reviewed-by: Florian Fainelli +Tested-by: Florian Fainelli +Signed-off-by: Jakub Kicinski +--- + drivers/net/dsa/b53/b53_common.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/net/dsa/b53/b53_common.c ++++ b/drivers/net/dsa/b53/b53_common.c +@@ -1222,7 +1222,7 @@ static void b53_adjust_link(struct dsa_s + return; + + /* Enable flow control on BCM5301x's CPU port */ +- if (is5301x(dev) && port == dev->cpu_port) ++ if (is5301x(dev) && dsa_is_cpu_port(ds, port)) + tx_pause = rx_pause = true; + + if (phydev->pause) { diff --git a/root/target/linux/generic/backport-5.15/743-v5.16-0004-net-dsa-b53-Drop-unused-cpu_port-field.patch b/root/target/linux/generic/backport-5.15/743-v5.16-0004-net-dsa-b53-Drop-unused-cpu_port-field.patch new file mode 100755 index 00000000..746a1e39 --- /dev/null +++ b/root/target/linux/generic/backport-5.15/743-v5.16-0004-net-dsa-b53-Drop-unused-cpu_port-field.patch @@ -0,0 +1,205 @@ +From 7d5af56418d7d01e43247a33b6fe6492ea871923 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= +Date: Thu, 16 Sep 2021 14:03:54 +0200 +Subject: [PATCH] net: dsa: b53: Drop unused "cpu_port" field +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +It's set but never used anymore. + +Signed-off-by: RafaÅ‚ MiÅ‚ecki +Reviewed-by: Florian Fainelli +Tested-by: Florian Fainelli +Signed-off-by: Jakub Kicinski +--- + drivers/net/dsa/b53/b53_common.c | 28 ---------------------------- + drivers/net/dsa/b53/b53_priv.h | 1 - + 2 files changed, 29 deletions(-) + +--- a/drivers/net/dsa/b53/b53_common.c ++++ b/drivers/net/dsa/b53/b53_common.c +@@ -2300,7 +2300,6 @@ static const struct b53_chip_data b53_sw + .arl_bins = 2, + .arl_buckets = 1024, + .imp_port = 5, +- .cpu_port = B53_CPU_PORT_25, + .duplex_reg = B53_DUPLEX_STAT_FE, + }, + { +@@ -2311,7 +2310,6 @@ static const struct b53_chip_data b53_sw + .arl_bins = 2, + .arl_buckets = 1024, + .imp_port = 5, +- .cpu_port = B53_CPU_PORT_25, + .duplex_reg = B53_DUPLEX_STAT_FE, + }, + { +@@ -2322,7 +2320,6 @@ static const struct b53_chip_data b53_sw + .arl_bins = 4, + .arl_buckets = 1024, + .imp_port = 8, +- .cpu_port = B53_CPU_PORT, + .vta_regs = B53_VTA_REGS, + .duplex_reg = B53_DUPLEX_STAT_GE, + .jumbo_pm_reg = B53_JUMBO_PORT_MASK, +@@ -2336,7 +2333,6 @@ static const struct b53_chip_data b53_sw + .arl_bins = 4, + .arl_buckets = 1024, + .imp_port = 8, +- .cpu_port = B53_CPU_PORT, + .vta_regs = B53_VTA_REGS, + .duplex_reg = B53_DUPLEX_STAT_GE, + .jumbo_pm_reg = B53_JUMBO_PORT_MASK, +@@ -2350,7 +2346,6 @@ static const struct b53_chip_data b53_sw + .arl_bins = 4, + .arl_buckets = 1024, + .imp_port = 8, +- .cpu_port = B53_CPU_PORT, + .vta_regs = B53_VTA_REGS_9798, + .duplex_reg = B53_DUPLEX_STAT_GE, + .jumbo_pm_reg = B53_JUMBO_PORT_MASK, +@@ -2364,7 +2359,6 @@ static const struct b53_chip_data b53_sw + .arl_bins = 4, + .arl_buckets = 1024, + .imp_port = 8, +- .cpu_port = B53_CPU_PORT, + .vta_regs = B53_VTA_REGS_9798, + .duplex_reg = B53_DUPLEX_STAT_GE, + .jumbo_pm_reg = B53_JUMBO_PORT_MASK, +@@ -2379,7 +2373,6 @@ static const struct b53_chip_data b53_sw + .arl_buckets = 1024, + .vta_regs = B53_VTA_REGS, + .imp_port = 8, +- .cpu_port = B53_CPU_PORT, + .duplex_reg = B53_DUPLEX_STAT_GE, + .jumbo_pm_reg = B53_JUMBO_PORT_MASK, + .jumbo_size_reg = B53_JUMBO_MAX_SIZE, +@@ -2392,7 +2385,6 @@ static const struct b53_chip_data b53_sw + .arl_bins = 4, + .arl_buckets = 1024, + .imp_port = 8, +- .cpu_port = B53_CPU_PORT, + .vta_regs = B53_VTA_REGS, + 
.duplex_reg = B53_DUPLEX_STAT_GE, + .jumbo_pm_reg = B53_JUMBO_PORT_MASK, +@@ -2406,7 +2398,6 @@ static const struct b53_chip_data b53_sw + .arl_bins = 4, + .arl_buckets = 1024, + .imp_port = 8, +- .cpu_port = B53_CPU_PORT, + .vta_regs = B53_VTA_REGS, + .duplex_reg = B53_DUPLEX_STAT_GE, + .jumbo_pm_reg = B53_JUMBO_PORT_MASK, +@@ -2420,7 +2411,6 @@ static const struct b53_chip_data b53_sw + .arl_bins = 4, + .arl_buckets = 1024, + .imp_port = 8, +- .cpu_port = B53_CPU_PORT, + .vta_regs = B53_VTA_REGS_63XX, + .duplex_reg = B53_DUPLEX_STAT_63XX, + .jumbo_pm_reg = B53_JUMBO_PORT_MASK_63XX, +@@ -2434,7 +2424,6 @@ static const struct b53_chip_data b53_sw + .arl_bins = 4, + .arl_buckets = 1024, + .imp_port = 8, +- .cpu_port = B53_CPU_PORT_25, /* TODO: auto detect */ + .vta_regs = B53_VTA_REGS, + .duplex_reg = B53_DUPLEX_STAT_GE, + .jumbo_pm_reg = B53_JUMBO_PORT_MASK, +@@ -2448,7 +2437,6 @@ static const struct b53_chip_data b53_sw + .arl_bins = 4, + .arl_buckets = 1024, + .imp_port = 8, +- .cpu_port = B53_CPU_PORT_25, /* TODO: auto detect */ + .vta_regs = B53_VTA_REGS, + .duplex_reg = B53_DUPLEX_STAT_GE, + .jumbo_pm_reg = B53_JUMBO_PORT_MASK, +@@ -2462,7 +2450,6 @@ static const struct b53_chip_data b53_sw + .arl_bins = 4, + .arl_buckets = 1024, + .imp_port = 8, +- .cpu_port = B53_CPU_PORT_25, /* TODO: auto detect */ + .vta_regs = B53_VTA_REGS, + .duplex_reg = B53_DUPLEX_STAT_GE, + .jumbo_pm_reg = B53_JUMBO_PORT_MASK, +@@ -2476,7 +2463,6 @@ static const struct b53_chip_data b53_sw + .arl_bins = 4, + .arl_buckets = 1024, + .imp_port = 8, +- .cpu_port = B53_CPU_PORT_25, /* TODO: auto detect */ + .vta_regs = B53_VTA_REGS, + .duplex_reg = B53_DUPLEX_STAT_GE, + .jumbo_pm_reg = B53_JUMBO_PORT_MASK, +@@ -2490,7 +2476,6 @@ static const struct b53_chip_data b53_sw + .arl_bins = 4, + .arl_buckets = 1024, + .imp_port = 8, +- .cpu_port = B53_CPU_PORT_25, /* TODO: auto detect */ + .vta_regs = B53_VTA_REGS, + .duplex_reg = B53_DUPLEX_STAT_GE, + .jumbo_pm_reg = B53_JUMBO_PORT_MASK, +@@ -2504,7 +2489,6 @@ static const struct b53_chip_data b53_sw + .arl_bins = 4, + .arl_buckets = 1024, + .imp_port = 8, +- .cpu_port = B53_CPU_PORT, + .vta_regs = B53_VTA_REGS, + .duplex_reg = B53_DUPLEX_STAT_GE, + .jumbo_pm_reg = B53_JUMBO_PORT_MASK, +@@ -2518,7 +2502,6 @@ static const struct b53_chip_data b53_sw + .arl_bins = 4, + .arl_buckets = 1024, + .imp_port = 8, +- .cpu_port = B53_CPU_PORT, + .vta_regs = B53_VTA_REGS, + .duplex_reg = B53_DUPLEX_STAT_GE, + .jumbo_pm_reg = B53_JUMBO_PORT_MASK, +@@ -2547,7 +2530,6 @@ static const struct b53_chip_data b53_sw + .arl_bins = 4, + .arl_buckets = 1024, + .imp_port = 8, +- .cpu_port = B53_CPU_PORT, + .vta_regs = B53_VTA_REGS, + .duplex_reg = B53_DUPLEX_STAT_GE, + .jumbo_pm_reg = B53_JUMBO_PORT_MASK, +@@ -2561,7 +2543,6 @@ static const struct b53_chip_data b53_sw + .arl_bins = 4, + .arl_buckets = 256, + .imp_port = 8, +- .cpu_port = B53_CPU_PORT, + .vta_regs = B53_VTA_REGS, + .duplex_reg = B53_DUPLEX_STAT_GE, + .jumbo_pm_reg = B53_JUMBO_PORT_MASK, +@@ -2587,7 +2568,6 @@ static int b53_switch_init(struct b53_de + dev->vta_regs[2] = chip->vta_regs[2]; + dev->jumbo_pm_reg = chip->jumbo_pm_reg; + dev->imp_port = chip->imp_port; +- dev->cpu_port = chip->cpu_port; + dev->num_vlans = chip->vlans; + dev->num_arl_bins = chip->arl_bins; + dev->num_arl_buckets = chip->arl_buckets; +@@ -2619,13 +2599,6 @@ static int b53_switch_init(struct b53_de + break; + #endif + } +- } else if (dev->chip_id == BCM53115_DEVICE_ID) { +- u64 strap_value; +- +- b53_read48(dev, B53_STAT_PAGE, B53_STRAP_VALUE, 
&strap_value); +- /* use second IMP port if GMII is enabled */ +- if (strap_value & SV_GMII_CTRL_115) +- dev->cpu_port = 5; + } + + dev->num_ports = fls(dev->enabled_ports); +--- a/drivers/net/dsa/b53/b53_priv.h ++++ b/drivers/net/dsa/b53/b53_priv.h +@@ -124,7 +124,6 @@ struct b53_device { + /* used ports mask */ + u16 enabled_ports; + unsigned int imp_port; +- unsigned int cpu_port; + + /* connect specific data */ + u8 current_page; diff --git a/root/target/linux/generic/backport-5.15/745-v5.16-01-net-phy-at803x-add-support-for-qca-8327-A-variant.patch b/root/target/linux/generic/backport-5.15/745-v5.16-01-net-phy-at803x-add-support-for-qca-8327-A-variant.patch new file mode 100755 index 00000000..07428364 --- /dev/null +++ b/root/target/linux/generic/backport-5.15/745-v5.16-01-net-phy-at803x-add-support-for-qca-8327-A-variant.patch @@ -0,0 +1,65 @@ +From b4df02b562f4aa14ff6811f30e1b4d2159585c59 Mon Sep 17 00:00:00 2001 +From: Ansuel Smith +Date: Sun, 19 Sep 2021 18:28:15 +0200 +Subject: net: phy: at803x: add support for qca 8327 A variant internal phy + +For qca8327 internal phy there are 2 different switch variant with 2 +different phy id. Add this missing variant so the internal phy can be +correctly identified and fixed. + +Signed-off-by: Ansuel Smith +Reviewed-by: Andrew Lunn +Signed-off-by: David S. Miller +--- + drivers/net/phy/at803x.c | 25 ++++++++++++++++++++----- + 1 file changed, 20 insertions(+), 5 deletions(-) + +--- a/drivers/net/phy/at803x.c ++++ b/drivers/net/phy/at803x.c +@@ -150,7 +150,8 @@ + #define ATH8035_PHY_ID 0x004dd072 + #define AT8030_PHY_ID_MASK 0xffffffef + +-#define QCA8327_PHY_ID 0x004dd034 ++#define QCA8327_A_PHY_ID 0x004dd033 ++#define QCA8327_B_PHY_ID 0x004dd034 + #define QCA8337_PHY_ID 0x004dd036 + #define QCA8K_PHY_ID_MASK 0xffffffff + +@@ -1421,10 +1422,23 @@ static struct phy_driver at803x_driver[] + .get_strings = at803x_get_strings, + .get_stats = at803x_get_stats, + }, { +- /* QCA8327 */ +- .phy_id = QCA8327_PHY_ID, ++ /* QCA8327-A from switch QCA8327-AL1A */ ++ .phy_id = QCA8327_A_PHY_ID, + .phy_id_mask = QCA8K_PHY_ID_MASK, +- .name = "QCA PHY 8327", ++ .name = "QCA PHY 8327-A", ++ /* PHY_GBIT_FEATURES */ ++ .probe = at803x_probe, ++ .flags = PHY_IS_INTERNAL, ++ .config_init = qca83xx_config_init, ++ .soft_reset = genphy_soft_reset, ++ .get_sset_count = at803x_get_sset_count, ++ .get_strings = at803x_get_strings, ++ .get_stats = at803x_get_stats, ++}, { ++ /* QCA8327-B from switch QCA8327-BL1A */ ++ .phy_id = QCA8327_B_PHY_ID, ++ .phy_id_mask = QCA8K_PHY_ID_MASK, ++ .name = "QCA PHY 8327-B", + /* PHY_GBIT_FEATURES */ + .probe = at803x_probe, + .flags = PHY_IS_INTERNAL, +@@ -1444,7 +1458,8 @@ static struct mdio_device_id __maybe_unu + { PHY_ID_MATCH_EXACT(ATH8035_PHY_ID) }, + { PHY_ID_MATCH_EXACT(ATH9331_PHY_ID) }, + { PHY_ID_MATCH_EXACT(QCA8337_PHY_ID) }, +- { PHY_ID_MATCH_EXACT(QCA8327_PHY_ID) }, ++ { PHY_ID_MATCH_EXACT(QCA8327_A_PHY_ID) }, ++ { PHY_ID_MATCH_EXACT(QCA8327_B_PHY_ID) }, + { } + }; + diff --git a/root/target/linux/generic/backport-5.15/745-v5.16-02-net-phy-at803x-add-resume-suspend-function-to-qca83x.patch b/root/target/linux/generic/backport-5.15/745-v5.16-02-net-phy-at803x-add-resume-suspend-function-to-qca83x.patch new file mode 100755 index 00000000..a572a318 --- /dev/null +++ b/root/target/linux/generic/backport-5.15/745-v5.16-02-net-phy-at803x-add-resume-suspend-function-to-qca83x.patch @@ -0,0 +1,45 @@ +From 15b9df4ece17d084f14eb0ca1cf05f2ad497e425 Mon Sep 17 00:00:00 2001 +From: Ansuel Smith +Date: Sun, 19 Sep 2021 18:28:16 
+0200 +Subject: net: phy: at803x: add resume/suspend function to qca83xx phy + +Add resume/suspend function to qca83xx internal phy. +We can't use the at803x generic function as the documentation lacks of +any support for WoL regs. + +Signed-off-by: Ansuel Smith +Reviewed-by: Andrew Lunn +Signed-off-by: David S. Miller +--- + drivers/net/phy/at803x.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +--- a/drivers/net/phy/at803x.c ++++ b/drivers/net/phy/at803x.c +@@ -1421,6 +1421,8 @@ static struct phy_driver at803x_driver[] + .get_sset_count = at803x_get_sset_count, + .get_strings = at803x_get_strings, + .get_stats = at803x_get_stats, ++ .suspend = genphy_suspend, ++ .resume = genphy_resume, + }, { + /* QCA8327-A from switch QCA8327-AL1A */ + .phy_id = QCA8327_A_PHY_ID, +@@ -1434,6 +1436,8 @@ static struct phy_driver at803x_driver[] + .get_sset_count = at803x_get_sset_count, + .get_strings = at803x_get_strings, + .get_stats = at803x_get_stats, ++ .suspend = genphy_suspend, ++ .resume = genphy_resume, + }, { + /* QCA8327-B from switch QCA8327-BL1A */ + .phy_id = QCA8327_B_PHY_ID, +@@ -1447,6 +1451,8 @@ static struct phy_driver at803x_driver[] + .get_sset_count = at803x_get_sset_count, + .get_strings = at803x_get_strings, + .get_stats = at803x_get_stats, ++ .suspend = genphy_suspend, ++ .resume = genphy_resume, + }, }; + + module_phy_driver(at803x_driver); diff --git a/root/target/linux/generic/backport-5.15/745-v5.16-03-net-phy-at803x-fix-spacing-and-improve-name-for-83xx.patch b/root/target/linux/generic/backport-5.15/745-v5.16-03-net-phy-at803x-fix-spacing-and-improve-name-for-83xx.patch new file mode 100755 index 00000000..45c1ff27 --- /dev/null +++ b/root/target/linux/generic/backport-5.15/745-v5.16-03-net-phy-at803x-fix-spacing-and-improve-name-for-83xx.patch @@ -0,0 +1,95 @@ +From d44fd8604a4ab92119adb35f05fd87612af722b5 Mon Sep 17 00:00:00 2001 +From: Ansuel Smith +Date: Sun, 19 Sep 2021 18:28:17 +0200 +Subject: net: phy: at803x: fix spacing and improve name for 83xx phy + +Fix spacing and improve name for 83xx phy following other phy in the +same driver. + +Signed-off-by: Ansuel Smith +Reviewed-by: Andrew Lunn +Signed-off-by: David S. 
Miller +--- + drivers/net/phy/at803x.c | 60 ++++++++++++++++++++++++------------------------ + 1 file changed, 30 insertions(+), 30 deletions(-) + +--- a/drivers/net/phy/at803x.c ++++ b/drivers/net/phy/at803x.c +@@ -1410,47 +1410,47 @@ static struct phy_driver at803x_driver[] + .config_aneg = at803x_config_aneg, + }, { + /* QCA8337 */ +- .phy_id = QCA8337_PHY_ID, +- .phy_id_mask = QCA8K_PHY_ID_MASK, +- .name = "QCA PHY 8337", ++ .phy_id = QCA8337_PHY_ID, ++ .phy_id_mask = QCA8K_PHY_ID_MASK, ++ .name = "Qualcomm Atheros 8337 internal PHY", + /* PHY_GBIT_FEATURES */ +- .probe = at803x_probe, +- .flags = PHY_IS_INTERNAL, +- .config_init = qca83xx_config_init, +- .soft_reset = genphy_soft_reset, +- .get_sset_count = at803x_get_sset_count, +- .get_strings = at803x_get_strings, +- .get_stats = at803x_get_stats, ++ .probe = at803x_probe, ++ .flags = PHY_IS_INTERNAL, ++ .config_init = qca83xx_config_init, ++ .soft_reset = genphy_soft_reset, ++ .get_sset_count = at803x_get_sset_count, ++ .get_strings = at803x_get_strings, ++ .get_stats = at803x_get_stats, + .suspend = genphy_suspend, + .resume = genphy_resume, + }, { + /* QCA8327-A from switch QCA8327-AL1A */ +- .phy_id = QCA8327_A_PHY_ID, +- .phy_id_mask = QCA8K_PHY_ID_MASK, +- .name = "QCA PHY 8327-A", ++ .phy_id = QCA8327_A_PHY_ID, ++ .phy_id_mask = QCA8K_PHY_ID_MASK, ++ .name = "Qualcomm Atheros 8327-A internal PHY", + /* PHY_GBIT_FEATURES */ +- .probe = at803x_probe, +- .flags = PHY_IS_INTERNAL, +- .config_init = qca83xx_config_init, +- .soft_reset = genphy_soft_reset, +- .get_sset_count = at803x_get_sset_count, +- .get_strings = at803x_get_strings, +- .get_stats = at803x_get_stats, ++ .probe = at803x_probe, ++ .flags = PHY_IS_INTERNAL, ++ .config_init = qca83xx_config_init, ++ .soft_reset = genphy_soft_reset, ++ .get_sset_count = at803x_get_sset_count, ++ .get_strings = at803x_get_strings, ++ .get_stats = at803x_get_stats, + .suspend = genphy_suspend, + .resume = genphy_resume, + }, { + /* QCA8327-B from switch QCA8327-BL1A */ +- .phy_id = QCA8327_B_PHY_ID, +- .phy_id_mask = QCA8K_PHY_ID_MASK, +- .name = "QCA PHY 8327-B", ++ .phy_id = QCA8327_B_PHY_ID, ++ .phy_id_mask = QCA8K_PHY_ID_MASK, ++ .name = "Qualcomm Atheros 8327-B internal PHY", + /* PHY_GBIT_FEATURES */ +- .probe = at803x_probe, +- .flags = PHY_IS_INTERNAL, +- .config_init = qca83xx_config_init, +- .soft_reset = genphy_soft_reset, +- .get_sset_count = at803x_get_sset_count, +- .get_strings = at803x_get_strings, +- .get_stats = at803x_get_stats, ++ .probe = at803x_probe, ++ .flags = PHY_IS_INTERNAL, ++ .config_init = qca83xx_config_init, ++ .soft_reset = genphy_soft_reset, ++ .get_sset_count = at803x_get_sset_count, ++ .get_strings = at803x_get_strings, ++ .get_stats = at803x_get_stats, + .suspend = genphy_suspend, + .resume = genphy_resume, + }, }; diff --git a/root/target/linux/generic/backport-5.15/746-v5.16-01-net-phy-at803x-fix-resume-for-QCA8327-phy.patch b/root/target/linux/generic/backport-5.15/746-v5.16-01-net-phy-at803x-fix-resume-for-QCA8327-phy.patch new file mode 100755 index 00000000..7f9182ab --- /dev/null +++ b/root/target/linux/generic/backport-5.15/746-v5.16-01-net-phy-at803x-fix-resume-for-QCA8327-phy.patch @@ -0,0 +1,131 @@ +From ba3c01ee02ed0d821c9f241f179bbc9457542b8f Mon Sep 17 00:00:00 2001 +From: Ansuel Smith +Date: Sun, 10 Oct 2021 00:46:15 +0200 +Subject: net: phy: at803x: fix resume for QCA8327 phy + +From Documentation phy resume triggers phy reset and restart +auto-negotiation. 
Add a dedicated function to wait reset to finish as +it was notice a regression where port sometime are not reliable after a +suspend/resume session. The reset wait logic is copied from phy_poll_reset. +Add dedicated suspend function to use genphy_suspend only with QCA8337 +phy and set only additional debug settings for QCA8327. With more test +it was reported that QCA8327 doesn't proprely support this mode and +using this cause the unreliability of the switch ports, especially the +malfunction of the port0. + +Fixes: 15b9df4ece17 ("net: phy: at803x: add resume/suspend function to qca83xx phy") +Signed-off-by: Ansuel Smith +Signed-off-by: David S. Miller +--- + drivers/net/phy/at803x.c | 69 +++++++++++++++++++++++++++++++++++++++++++----- + 1 file changed, 63 insertions(+), 6 deletions(-) + +--- a/drivers/net/phy/at803x.c ++++ b/drivers/net/phy/at803x.c +@@ -92,9 +92,14 @@ + #define AT803X_DEBUG_REG_5 0x05 + #define AT803X_DEBUG_TX_CLK_DLY_EN BIT(8) + ++#define AT803X_DEBUG_REG_HIB_CTRL 0x0b ++#define AT803X_DEBUG_HIB_CTRL_SEL_RST_80U BIT(10) ++#define AT803X_DEBUG_HIB_CTRL_EN_ANY_CHANGE BIT(13) ++ + #define AT803X_DEBUG_REG_3C 0x3C + + #define AT803X_DEBUG_REG_3D 0x3D ++#define AT803X_DEBUG_GATE_CLK_IN1000 BIT(6) + + #define AT803X_DEBUG_REG_1F 0x1F + #define AT803X_DEBUG_PLL_ON BIT(2) +@@ -1312,6 +1317,58 @@ static int qca83xx_config_init(struct ph + return 0; + } + ++static int qca83xx_resume(struct phy_device *phydev) ++{ ++ int ret, val; ++ ++ /* Skip reset if not suspended */ ++ if (!phydev->suspended) ++ return 0; ++ ++ /* Reinit the port, reset values set by suspend */ ++ qca83xx_config_init(phydev); ++ ++ /* Reset the port on port resume */ ++ phy_set_bits(phydev, MII_BMCR, BMCR_RESET | BMCR_ANENABLE); ++ ++ /* On resume from suspend the switch execute a reset and ++ * restart auto-negotiation. Wait for reset to complete. ++ */ ++ ret = phy_read_poll_timeout(phydev, MII_BMCR, val, !(val & BMCR_RESET), ++ 50000, 600000, true); ++ if (ret) ++ return ret; ++ ++ msleep(1); ++ ++ return 0; ++} ++ ++static int qca83xx_suspend(struct phy_device *phydev) ++{ ++ u16 mask = 0; ++ ++ /* Only QCA8337 support actual suspend. ++ * QCA8327 cause port unreliability when phy suspend ++ * is set. 
++ */ ++ if (phydev->drv->phy_id == QCA8337_PHY_ID) { ++ genphy_suspend(phydev); ++ } else { ++ mask |= ~(BMCR_SPEED1000 | BMCR_FULLDPLX); ++ phy_modify(phydev, MII_BMCR, mask, 0); ++ } ++ ++ at803x_debug_reg_mask(phydev, AT803X_DEBUG_REG_3D, ++ AT803X_DEBUG_GATE_CLK_IN1000, 0); ++ ++ at803x_debug_reg_mask(phydev, AT803X_DEBUG_REG_HIB_CTRL, ++ AT803X_DEBUG_HIB_CTRL_EN_ANY_CHANGE | ++ AT803X_DEBUG_HIB_CTRL_SEL_RST_80U, 0); ++ ++ return 0; ++} ++ + static struct phy_driver at803x_driver[] = { + { + /* Qualcomm Atheros AR8035 */ +@@ -1421,8 +1478,8 @@ static struct phy_driver at803x_driver[] + .get_sset_count = at803x_get_sset_count, + .get_strings = at803x_get_strings, + .get_stats = at803x_get_stats, +- .suspend = genphy_suspend, +- .resume = genphy_resume, ++ .suspend = qca83xx_suspend, ++ .resume = qca83xx_resume, + }, { + /* QCA8327-A from switch QCA8327-AL1A */ + .phy_id = QCA8327_A_PHY_ID, +@@ -1436,8 +1493,8 @@ static struct phy_driver at803x_driver[] + .get_sset_count = at803x_get_sset_count, + .get_strings = at803x_get_strings, + .get_stats = at803x_get_stats, +- .suspend = genphy_suspend, +- .resume = genphy_resume, ++ .suspend = qca83xx_suspend, ++ .resume = qca83xx_resume, + }, { + /* QCA8327-B from switch QCA8327-BL1A */ + .phy_id = QCA8327_B_PHY_ID, +@@ -1451,8 +1508,8 @@ static struct phy_driver at803x_driver[] + .get_sset_count = at803x_get_sset_count, + .get_strings = at803x_get_strings, + .get_stats = at803x_get_stats, +- .suspend = genphy_suspend, +- .resume = genphy_resume, ++ .suspend = qca83xx_suspend, ++ .resume = qca83xx_resume, + }, }; + + module_phy_driver(at803x_driver); diff --git a/root/target/linux/generic/backport-5.15/746-v5.16-02-net-phy-at803x-add-DAC-amplitude-fix-for-8327-phy.patch b/root/target/linux/generic/backport-5.15/746-v5.16-02-net-phy-at803x-add-DAC-amplitude-fix-for-8327-phy.patch new file mode 100755 index 00000000..cfdfe2c4 --- /dev/null +++ b/root/target/linux/generic/backport-5.15/746-v5.16-02-net-phy-at803x-add-DAC-amplitude-fix-for-8327-phy.patch @@ -0,0 +1,91 @@ +From 1ca8311949aec5c9447645731ef1c6bc5bd71350 Mon Sep 17 00:00:00 2001 +From: Ansuel Smith +Date: Sun, 10 Oct 2021 00:46:16 +0200 +Subject: net: phy: at803x: add DAC amplitude fix for 8327 phy + +QCA8327 internal phy require DAC amplitude adjustement set to +6% with +100m speed. Also add additional define to report a change of the same +reg in QCA8337. (different scope it does set 1000m voltage) +Add link_change_notify function to set the proper amplitude adjustement +on PHY_RUNNING state and disable on any other state. + +Fixes: b4df02b562f4 ("net: phy: at803x: add support for qca 8327 A variant internal phy") +Signed-off-by: Ansuel Smith +Signed-off-by: David S. Miller +--- + drivers/net/phy/at803x.c | 33 +++++++++++++++++++++++++++++++++ + 1 file changed, 33 insertions(+) + +--- a/drivers/net/phy/at803x.c ++++ b/drivers/net/phy/at803x.c +@@ -87,6 +87,8 @@ + #define AT803X_PSSR_MR_AN_COMPLETE 0x0200 + + #define AT803X_DEBUG_REG_0 0x00 ++#define QCA8327_DEBUG_MANU_CTRL_EN BIT(2) ++#define QCA8337_DEBUG_MANU_CTRL_EN GENMASK(3, 2) + #define AT803X_DEBUG_RX_CLK_DLY_EN BIT(15) + + #define AT803X_DEBUG_REG_5 0x05 +@@ -1314,9 +1316,37 @@ static int qca83xx_config_init(struct ph + break; + } + ++ /* QCA8327 require DAC amplitude adjustment for 100m set to +6%. ++ * Disable on init and enable only with 100m speed following ++ * qca original source code. 
++ */ ++ if (phydev->drv->phy_id == QCA8327_A_PHY_ID || ++ phydev->drv->phy_id == QCA8327_B_PHY_ID) ++ at803x_debug_reg_mask(phydev, AT803X_DEBUG_REG_0, ++ QCA8327_DEBUG_MANU_CTRL_EN, 0); ++ + return 0; + } + ++static void qca83xx_link_change_notify(struct phy_device *phydev) ++{ ++ /* QCA8337 doesn't require DAC Amplitude adjustement */ ++ if (phydev->drv->phy_id == QCA8337_PHY_ID) ++ return; ++ ++ /* Set DAC Amplitude adjustment to +6% for 100m on link running */ ++ if (phydev->state == PHY_RUNNING) { ++ if (phydev->speed == SPEED_100) ++ at803x_debug_reg_mask(phydev, AT803X_DEBUG_REG_0, ++ QCA8327_DEBUG_MANU_CTRL_EN, ++ QCA8327_DEBUG_MANU_CTRL_EN); ++ } else { ++ /* Reset DAC Amplitude adjustment */ ++ at803x_debug_reg_mask(phydev, AT803X_DEBUG_REG_0, ++ QCA8327_DEBUG_MANU_CTRL_EN, 0); ++ } ++} ++ + static int qca83xx_resume(struct phy_device *phydev) + { + int ret, val; +@@ -1471,6 +1501,7 @@ static struct phy_driver at803x_driver[] + .phy_id_mask = QCA8K_PHY_ID_MASK, + .name = "Qualcomm Atheros 8337 internal PHY", + /* PHY_GBIT_FEATURES */ ++ .link_change_notify = qca83xx_link_change_notify, + .probe = at803x_probe, + .flags = PHY_IS_INTERNAL, + .config_init = qca83xx_config_init, +@@ -1486,6 +1517,7 @@ static struct phy_driver at803x_driver[] + .phy_id_mask = QCA8K_PHY_ID_MASK, + .name = "Qualcomm Atheros 8327-A internal PHY", + /* PHY_GBIT_FEATURES */ ++ .link_change_notify = qca83xx_link_change_notify, + .probe = at803x_probe, + .flags = PHY_IS_INTERNAL, + .config_init = qca83xx_config_init, +@@ -1501,6 +1533,7 @@ static struct phy_driver at803x_driver[] + .phy_id_mask = QCA8K_PHY_ID_MASK, + .name = "Qualcomm Atheros 8327-B internal PHY", + /* PHY_GBIT_FEATURES */ ++ .link_change_notify = qca83xx_link_change_notify, + .probe = at803x_probe, + .flags = PHY_IS_INTERNAL, + .config_init = qca83xx_config_init, diff --git a/root/target/linux/generic/backport-5.15/746-v5.16-03-net-phy-at803x-enable-prefer-master-for-83xx-interna.patch b/root/target/linux/generic/backport-5.15/746-v5.16-03-net-phy-at803x-enable-prefer-master-for-83xx-interna.patch new file mode 100755 index 00000000..71c1e60f --- /dev/null +++ b/root/target/linux/generic/backport-5.15/746-v5.16-03-net-phy-at803x-enable-prefer-master-for-83xx-interna.patch @@ -0,0 +1,27 @@ +From 9d1c29b4028557a496be9c5eb2b4b86063700636 Mon Sep 17 00:00:00 2001 +From: Ansuel Smith +Date: Sun, 10 Oct 2021 00:46:17 +0200 +Subject: net: phy: at803x: enable prefer master for 83xx internal phy + +From original QCA source code the port was set to prefer master as port +type in 1000BASE-T mode. Apply the same settings also here. + +Signed-off-by: Ansuel Smith +Reviewed-by: Andrew Lunn +Signed-off-by: David S. 
Miller +--- + drivers/net/phy/at803x.c | 3 +++ + 1 file changed, 3 insertions(+) + +--- a/drivers/net/phy/at803x.c ++++ b/drivers/net/phy/at803x.c +@@ -1325,6 +1325,9 @@ static int qca83xx_config_init(struct ph + at803x_debug_reg_mask(phydev, AT803X_DEBUG_REG_0, + QCA8327_DEBUG_MANU_CTRL_EN, 0); + ++ /* Following original QCA sourcecode set port to prefer master */ ++ phy_set_bits(phydev, MII_CTRL1000, CTL1000_PREFER_MASTER); ++ + return 0; + } + diff --git a/root/target/linux/generic/backport-5.15/746-v5.16-04-net-phy-at803x-better-describe-debug-regs.patch b/root/target/linux/generic/backport-5.15/746-v5.16-04-net-phy-at803x-better-describe-debug-regs.patch new file mode 100755 index 00000000..64163bfb --- /dev/null +++ b/root/target/linux/generic/backport-5.15/746-v5.16-04-net-phy-at803x-better-describe-debug-regs.patch @@ -0,0 +1,127 @@ +From 67999555ff42e91de7654488d9a7735bd9e84555 Mon Sep 17 00:00:00 2001 +From: Ansuel Smith +Date: Sun, 10 Oct 2021 00:46:18 +0200 +Subject: net: phy: at803x: better describe debug regs + +Give a name to known debug regs from Documentation instead of using +unknown hex values. + +Signed-off-by: Ansuel Smith +Reviewed-by: Andrew Lunn +Signed-off-by: David S. Miller +--- + drivers/net/phy/at803x.c | 30 +++++++++++++++--------------- + 1 file changed, 15 insertions(+), 15 deletions(-) + +--- a/drivers/net/phy/at803x.c ++++ b/drivers/net/phy/at803x.c +@@ -86,12 +86,12 @@ + #define AT803X_PSSR 0x11 /*PHY-Specific Status Register*/ + #define AT803X_PSSR_MR_AN_COMPLETE 0x0200 + +-#define AT803X_DEBUG_REG_0 0x00 ++#define AT803X_DEBUG_ANALOG_TEST_CTRL 0x00 + #define QCA8327_DEBUG_MANU_CTRL_EN BIT(2) + #define QCA8337_DEBUG_MANU_CTRL_EN GENMASK(3, 2) + #define AT803X_DEBUG_RX_CLK_DLY_EN BIT(15) + +-#define AT803X_DEBUG_REG_5 0x05 ++#define AT803X_DEBUG_SYSTEM_CTRL_MODE 0x05 + #define AT803X_DEBUG_TX_CLK_DLY_EN BIT(8) + + #define AT803X_DEBUG_REG_HIB_CTRL 0x0b +@@ -100,7 +100,7 @@ + + #define AT803X_DEBUG_REG_3C 0x3C + +-#define AT803X_DEBUG_REG_3D 0x3D ++#define AT803X_DEBUG_REG_GREEN 0x3D + #define AT803X_DEBUG_GATE_CLK_IN1000 BIT(6) + + #define AT803X_DEBUG_REG_1F 0x1F +@@ -284,25 +284,25 @@ static int at803x_read_page(struct phy_d + + static int at803x_enable_rx_delay(struct phy_device *phydev) + { +- return at803x_debug_reg_mask(phydev, AT803X_DEBUG_REG_0, 0, ++ return at803x_debug_reg_mask(phydev, AT803X_DEBUG_ANALOG_TEST_CTRL, 0, + AT803X_DEBUG_RX_CLK_DLY_EN); + } + + static int at803x_enable_tx_delay(struct phy_device *phydev) + { +- return at803x_debug_reg_mask(phydev, AT803X_DEBUG_REG_5, 0, ++ return at803x_debug_reg_mask(phydev, AT803X_DEBUG_SYSTEM_CTRL_MODE, 0, + AT803X_DEBUG_TX_CLK_DLY_EN); + } + + static int at803x_disable_rx_delay(struct phy_device *phydev) + { +- return at803x_debug_reg_mask(phydev, AT803X_DEBUG_REG_0, ++ return at803x_debug_reg_mask(phydev, AT803X_DEBUG_ANALOG_TEST_CTRL, + AT803X_DEBUG_RX_CLK_DLY_EN, 0); + } + + static int at803x_disable_tx_delay(struct phy_device *phydev) + { +- return at803x_debug_reg_mask(phydev, AT803X_DEBUG_REG_5, ++ return at803x_debug_reg_mask(phydev, AT803X_DEBUG_SYSTEM_CTRL_MODE, + AT803X_DEBUG_TX_CLK_DLY_EN, 0); + } + +@@ -1300,9 +1300,9 @@ static int qca83xx_config_init(struct ph + switch (switch_revision) { + case 1: + /* For 100M waveform */ +- at803x_debug_reg_write(phydev, AT803X_DEBUG_REG_0, 0x02ea); ++ at803x_debug_reg_write(phydev, AT803X_DEBUG_ANALOG_TEST_CTRL, 0x02ea); + /* Turn on Gigabit clock */ +- at803x_debug_reg_write(phydev, AT803X_DEBUG_REG_3D, 0x68a0); ++ 
at803x_debug_reg_write(phydev, AT803X_DEBUG_REG_GREEN, 0x68a0); + break; + + case 2: +@@ -1310,8 +1310,8 @@ static int qca83xx_config_init(struct ph + fallthrough; + case 4: + phy_write_mmd(phydev, MDIO_MMD_PCS, MDIO_AZ_DEBUG, 0x803f); +- at803x_debug_reg_write(phydev, AT803X_DEBUG_REG_3D, 0x6860); +- at803x_debug_reg_write(phydev, AT803X_DEBUG_REG_5, 0x2c46); ++ at803x_debug_reg_write(phydev, AT803X_DEBUG_REG_GREEN, 0x6860); ++ at803x_debug_reg_write(phydev, AT803X_DEBUG_SYSTEM_CTRL_MODE, 0x2c46); + at803x_debug_reg_write(phydev, AT803X_DEBUG_REG_3C, 0x6000); + break; + } +@@ -1322,7 +1322,7 @@ static int qca83xx_config_init(struct ph + */ + if (phydev->drv->phy_id == QCA8327_A_PHY_ID || + phydev->drv->phy_id == QCA8327_B_PHY_ID) +- at803x_debug_reg_mask(phydev, AT803X_DEBUG_REG_0, ++ at803x_debug_reg_mask(phydev, AT803X_DEBUG_ANALOG_TEST_CTRL, + QCA8327_DEBUG_MANU_CTRL_EN, 0); + + /* Following original QCA sourcecode set port to prefer master */ +@@ -1340,12 +1340,12 @@ static void qca83xx_link_change_notify(s + /* Set DAC Amplitude adjustment to +6% for 100m on link running */ + if (phydev->state == PHY_RUNNING) { + if (phydev->speed == SPEED_100) +- at803x_debug_reg_mask(phydev, AT803X_DEBUG_REG_0, ++ at803x_debug_reg_mask(phydev, AT803X_DEBUG_ANALOG_TEST_CTRL, + QCA8327_DEBUG_MANU_CTRL_EN, + QCA8327_DEBUG_MANU_CTRL_EN); + } else { + /* Reset DAC Amplitude adjustment */ +- at803x_debug_reg_mask(phydev, AT803X_DEBUG_REG_0, ++ at803x_debug_reg_mask(phydev, AT803X_DEBUG_ANALOG_TEST_CTRL, + QCA8327_DEBUG_MANU_CTRL_EN, 0); + } + } +@@ -1392,7 +1392,7 @@ static int qca83xx_suspend(struct phy_de + phy_modify(phydev, MII_BMCR, mask, 0); + } + +- at803x_debug_reg_mask(phydev, AT803X_DEBUG_REG_3D, ++ at803x_debug_reg_mask(phydev, AT803X_DEBUG_REG_GREEN, + AT803X_DEBUG_GATE_CLK_IN1000, 0); + + at803x_debug_reg_mask(phydev, AT803X_DEBUG_REG_HIB_CTRL, diff --git a/root/target/linux/generic/backport-5.15/747-v5.16-01-dsa-qca8k-add-mac-power-sel-support.patch b/root/target/linux/generic/backport-5.15/747-v5.16-01-dsa-qca8k-add-mac-power-sel-support.patch new file mode 100755 index 00000000..c8d424de --- /dev/null +++ b/root/target/linux/generic/backport-5.15/747-v5.16-01-dsa-qca8k-add-mac-power-sel-support.patch @@ -0,0 +1,80 @@ +From d8b6f5bae6d3b648a67b6958cb98e4e97256d652 Mon Sep 17 00:00:00 2001 +From: Ansuel Smith +Date: Thu, 14 Oct 2021 00:39:06 +0200 +Subject: dsa: qca8k: add mac_power_sel support + +Add missing mac power sel support needed for ipq8064/5 SoC that require +1.8v for the internal regulator port instead of the default 1.5v. +If other device needs this, consider adding a dedicated binding to +support this. + +Signed-off-by: Ansuel Smith +Reviewed-by: Vladimir Oltean +Reviewed-by: Florian Fainelli +Signed-off-by: David S. Miller +--- + drivers/net/dsa/qca8k.c | 31 +++++++++++++++++++++++++++++++ + drivers/net/dsa/qca8k.h | 5 +++++ + 2 files changed, 36 insertions(+) + +--- a/drivers/net/dsa/qca8k.c ++++ b/drivers/net/dsa/qca8k.c +@@ -951,6 +951,33 @@ qca8k_setup_of_rgmii_delay(struct qca8k_ + } + + static int ++qca8k_setup_mac_pwr_sel(struct qca8k_priv *priv) ++{ ++ u32 mask = 0; ++ int ret = 0; ++ ++ /* SoC specific settings for ipq8064. ++ * If more device require this consider adding ++ * a dedicated binding. 
++ */ ++ if (of_machine_is_compatible("qcom,ipq8064")) ++ mask |= QCA8K_MAC_PWR_RGMII0_1_8V; ++ ++ /* SoC specific settings for ipq8065 */ ++ if (of_machine_is_compatible("qcom,ipq8065")) ++ mask |= QCA8K_MAC_PWR_RGMII1_1_8V; ++ ++ if (mask) { ++ ret = qca8k_rmw(priv, QCA8K_REG_MAC_PWR_SEL, ++ QCA8K_MAC_PWR_RGMII0_1_8V | ++ QCA8K_MAC_PWR_RGMII1_1_8V, ++ mask); ++ } ++ ++ return ret; ++} ++ ++static int + qca8k_setup(struct dsa_switch *ds) + { + struct qca8k_priv *priv = (struct qca8k_priv *)ds->priv; +@@ -979,6 +1006,10 @@ qca8k_setup(struct dsa_switch *ds) + if (ret) + return ret; + ++ ret = qca8k_setup_mac_pwr_sel(priv); ++ if (ret) ++ return ret; ++ + /* Enable CPU Port */ + ret = qca8k_reg_set(priv, QCA8K_REG_GLOBAL_FW_CTRL0, + QCA8K_GLOBAL_FW_CTRL0_CPU_PORT_EN); +--- a/drivers/net/dsa/qca8k.h ++++ b/drivers/net/dsa/qca8k.h +@@ -100,6 +100,11 @@ + #define QCA8K_SGMII_MODE_CTRL_PHY (1 << 22) + #define QCA8K_SGMII_MODE_CTRL_MAC (2 << 22) + ++/* MAC_PWR_SEL registers */ ++#define QCA8K_REG_MAC_PWR_SEL 0x0e4 ++#define QCA8K_MAC_PWR_RGMII1_1_8V BIT(18) ++#define QCA8K_MAC_PWR_RGMII0_1_8V BIT(19) ++ + /* EEE control registers */ + #define QCA8K_REG_EEE_CTRL 0x100 + #define QCA8K_REG_EEE_CTRL_LPI_EN(_i) ((_i + 1) * 2) diff --git a/root/target/linux/generic/backport-5.15/747-v5.16-02-dt-bindings-net-dsa-qca8k-Add-SGMII-clock-phase-prop.patch b/root/target/linux/generic/backport-5.15/747-v5.16-02-dt-bindings-net-dsa-qca8k-Add-SGMII-clock-phase-prop.patch new file mode 100755 index 00000000..bd768ec2 --- /dev/null +++ b/root/target/linux/generic/backport-5.15/747-v5.16-02-dt-bindings-net-dsa-qca8k-Add-SGMII-clock-phase-prop.patch @@ -0,0 +1,30 @@ +From fdbf35df9c091db9c46e57e9938e3f7a4f603a7c Mon Sep 17 00:00:00 2001 +From: Ansuel Smith +Date: Thu, 14 Oct 2021 00:39:07 +0200 +Subject: dt-bindings: net: dsa: qca8k: Add SGMII clock phase properties + +Add names and descriptions of additional PORT0_PAD_CTRL properties. +qca,sgmii-(rx|tx)clk-falling-edge are for setting the respective clock +phase to failling edge. + +Co-developed-by: Matthew Hagan +Signed-off-by: Matthew Hagan +Signed-off-by: Ansuel Smith +Signed-off-by: David S. Miller +--- + Documentation/devicetree/bindings/net/dsa/qca8k.txt | 4 ++++ + 1 file changed, 4 insertions(+) + +--- a/Documentation/devicetree/bindings/net/dsa/qca8k.txt ++++ b/Documentation/devicetree/bindings/net/dsa/qca8k.txt +@@ -37,6 +37,10 @@ A CPU port node has the following option + managed entity. See + Documentation/devicetree/bindings/net/fixed-link.txt + for details. ++- qca,sgmii-rxclk-falling-edge: Set the receive clock phase to falling edge. ++ Mostly used in qca8327 with CPU port 0 set to ++ sgmii. ++- qca,sgmii-txclk-falling-edge: Set the transmit clock phase to falling edge. + + For QCA8K the 'fixed-link' sub-node supports only the following properties: + diff --git a/root/target/linux/generic/backport-5.15/747-v5.16-03-net-dsa-qca8k-add-support-for-sgmii-falling-edge.patch b/root/target/linux/generic/backport-5.15/747-v5.16-03-net-dsa-qca8k-add-support-for-sgmii-falling-edge.patch new file mode 100755 index 00000000..e464452d --- /dev/null +++ b/root/target/linux/generic/backport-5.15/747-v5.16-03-net-dsa-qca8k-add-support-for-sgmii-falling-edge.patch @@ -0,0 +1,127 @@ +From 6c43809bf1bee76c434e365a26546a92a5fbec14 Mon Sep 17 00:00:00 2001 +From: Ansuel Smith +Date: Thu, 14 Oct 2021 00:39:08 +0200 +Subject: net: dsa: qca8k: add support for sgmii falling edge + +Add support for this in the qca8k driver. 
Also add support for SGMII +rx/tx clock falling edge. This is only present for pad0, pad5 and +pad6 have these bit reserved from Documentation. Add a comment that this +is hardcoded to PAD0 as qca8327/28/34/37 have an unique sgmii line and +setting falling in port0 applies to both configuration with sgmii used +for port0 or port6. + +Co-developed-by: Matthew Hagan +Signed-off-by: Matthew Hagan +Signed-off-by: Ansuel Smith +Signed-off-by: David S. Miller +--- + drivers/net/dsa/qca8k.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++++ + drivers/net/dsa/qca8k.h | 4 ++++ + 2 files changed, 67 insertions(+) + +--- a/drivers/net/dsa/qca8k.c ++++ b/drivers/net/dsa/qca8k.c +@@ -978,6 +978,42 @@ qca8k_setup_mac_pwr_sel(struct qca8k_pri + } + + static int ++qca8k_parse_port_config(struct qca8k_priv *priv) ++{ ++ struct device_node *port_dn; ++ phy_interface_t mode; ++ struct dsa_port *dp; ++ int port, ret; ++ ++ /* We have 2 CPU port. Check them */ ++ for (port = 0; port < QCA8K_NUM_PORTS; port++) { ++ /* Skip every other port */ ++ if (port != 0 && port != 6) ++ continue; ++ ++ dp = dsa_to_port(priv->ds, port); ++ port_dn = dp->dn; ++ ++ if (!of_device_is_available(port_dn)) ++ continue; ++ ++ ret = of_get_phy_mode(port_dn, &mode); ++ if (ret) ++ continue; ++ ++ if (mode == PHY_INTERFACE_MODE_SGMII) { ++ if (of_property_read_bool(port_dn, "qca,sgmii-txclk-falling-edge")) ++ priv->sgmii_tx_clk_falling_edge = true; ++ ++ if (of_property_read_bool(port_dn, "qca,sgmii-rxclk-falling-edge")) ++ priv->sgmii_rx_clk_falling_edge = true; ++ } ++ } ++ ++ return 0; ++} ++ ++static int + qca8k_setup(struct dsa_switch *ds) + { + struct qca8k_priv *priv = (struct qca8k_priv *)ds->priv; +@@ -990,6 +1026,11 @@ qca8k_setup(struct dsa_switch *ds) + return -EINVAL; + } + ++ /* Parse CPU port config to be later used in phy_link mac_config */ ++ ret = qca8k_parse_port_config(priv); ++ if (ret) ++ return ret; ++ + mutex_init(&priv->reg_mutex); + + /* Start by setting up the register mapping */ +@@ -1274,6 +1315,28 @@ qca8k_phylink_mac_config(struct dsa_swit + } + + qca8k_write(priv, QCA8K_REG_SGMII_CTRL, val); ++ ++ /* For qca8327/qca8328/qca8334/qca8338 sgmii is unique and ++ * falling edge is set writing in the PORT0 PAD reg ++ */ ++ if (priv->switch_id == QCA8K_ID_QCA8327 || ++ priv->switch_id == QCA8K_ID_QCA8337) ++ reg = QCA8K_REG_PORT0_PAD_CTRL; ++ ++ val = 0; ++ ++ /* SGMII Clock phase configuration */ ++ if (priv->sgmii_rx_clk_falling_edge) ++ val |= QCA8K_PORT0_PAD_SGMII_RXCLK_FALLING_EDGE; ++ ++ if (priv->sgmii_tx_clk_falling_edge) ++ val |= QCA8K_PORT0_PAD_SGMII_TXCLK_FALLING_EDGE; ++ ++ if (val) ++ ret = qca8k_rmw(priv, reg, ++ QCA8K_PORT0_PAD_SGMII_RXCLK_FALLING_EDGE | ++ QCA8K_PORT0_PAD_SGMII_TXCLK_FALLING_EDGE, ++ val); + break; + default: + dev_err(ds->dev, "xMII mode %s not supported for port %d\n", +--- a/drivers/net/dsa/qca8k.h ++++ b/drivers/net/dsa/qca8k.h +@@ -35,6 +35,8 @@ + #define QCA8K_MASK_CTRL_DEVICE_ID_MASK GENMASK(15, 8) + #define QCA8K_MASK_CTRL_DEVICE_ID(x) ((x) >> 8) + #define QCA8K_REG_PORT0_PAD_CTRL 0x004 ++#define QCA8K_PORT0_PAD_SGMII_RXCLK_FALLING_EDGE BIT(19) ++#define QCA8K_PORT0_PAD_SGMII_TXCLK_FALLING_EDGE BIT(18) + #define QCA8K_REG_PORT5_PAD_CTRL 0x008 + #define QCA8K_REG_PORT6_PAD_CTRL 0x00c + #define QCA8K_PORT_PAD_RGMII_EN BIT(26) +@@ -260,6 +262,8 @@ struct qca8k_priv { + u8 switch_revision; + u8 rgmii_tx_delay; + u8 rgmii_rx_delay; ++ bool sgmii_rx_clk_falling_edge; ++ bool sgmii_tx_clk_falling_edge; + bool legacy_phy_port_mapping; + struct regmap *regmap; + struct 
mii_bus *bus; diff --git a/root/target/linux/generic/backport-5.15/747-v5.16-04-dt-bindings-net-dsa-qca8k-Document-support-for-CPU-p.patch b/root/target/linux/generic/backport-5.15/747-v5.16-04-dt-bindings-net-dsa-qca8k-Document-support-for-CPU-p.patch new file mode 100755 index 00000000..606ac0af --- /dev/null +++ b/root/target/linux/generic/backport-5.15/747-v5.16-04-dt-bindings-net-dsa-qca8k-Document-support-for-CPU-p.patch @@ -0,0 +1,29 @@ +From 731d613338ec6de482053ffa3f71be2325b0f8eb Mon Sep 17 00:00:00 2001 +From: Ansuel Smith +Date: Thu, 14 Oct 2021 00:39:09 +0200 +Subject: dt-bindings: net: dsa: qca8k: Document support for CPU port 6 + +The switch now support CPU port to be set 6 instead of be hardcoded to +0. Document support for it and describe logic selection. + +Signed-off-by: Ansuel Smith +Signed-off-by: David S. Miller +--- + Documentation/devicetree/bindings/net/dsa/qca8k.txt | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +--- a/Documentation/devicetree/bindings/net/dsa/qca8k.txt ++++ b/Documentation/devicetree/bindings/net/dsa/qca8k.txt +@@ -29,7 +29,11 @@ the mdio MASTER is used as communication + Don't use mixed external and internal mdio-bus configurations, as this is + not supported by the hardware. + +-The CPU port of this switch is always port 0. ++This switch support 2 CPU port. Normally and advised configuration is with ++CPU port set to port 0. It is also possible to set the CPU port to port 6 ++if the device requires it. The driver will configure the switch to the defined ++port. With both CPU port declared the first CPU port is selected as primary ++and the secondary CPU ignored. + + A CPU port node has the following optional node: + diff --git a/root/target/linux/generic/backport-5.15/747-v5.16-05-net-dsa-qca8k-add-support-for-cpu-port-6.patch b/root/target/linux/generic/backport-5.15/747-v5.16-05-net-dsa-qca8k-add-support-for-cpu-port-6.patch new file mode 100755 index 00000000..320db8fa --- /dev/null +++ b/root/target/linux/generic/backport-5.15/747-v5.16-05-net-dsa-qca8k-add-support-for-cpu-port-6.patch @@ -0,0 +1,153 @@ +From 3fcf734aa482487df83cf8f18608438fcf59127f Mon Sep 17 00:00:00 2001 +From: Ansuel Smith +Date: Thu, 14 Oct 2021 00:39:10 +0200 +Subject: net: dsa: qca8k: add support for cpu port 6 + +Currently CPU port is always hardcoded to port 0. This switch have 2 CPU +ports. The original intention of this driver seems to be use the +mac06_exchange bit to swap MAC0 with MAC6 in the strange configuration +where device have connected only the CPU port 6. To skip the +introduction of a new binding, rework the driver to address the +secondary CPU port as primary and drop any reference of hardcoded port. +With configuration of mac06 exchange, just skip the definition of port0 +and define the CPU port as a secondary. The driver will autoconfigure +the switch to use that as the primary CPU port. + +Signed-off-by: Ansuel Smith +Signed-off-by: David S. Miller +--- + drivers/net/dsa/qca8k.c | 51 ++++++++++++++++++++++++++++++++++--------------- + drivers/net/dsa/qca8k.h | 2 -- + 2 files changed, 36 insertions(+), 17 deletions(-) + +--- a/drivers/net/dsa/qca8k.c ++++ b/drivers/net/dsa/qca8k.c +@@ -977,6 +977,22 @@ qca8k_setup_mac_pwr_sel(struct qca8k_pri + return ret; + } + ++static int qca8k_find_cpu_port(struct dsa_switch *ds) ++{ ++ struct qca8k_priv *priv = ds->priv; ++ ++ /* Find the connected cpu port. Valid port are 0 or 6 */ ++ if (dsa_is_cpu_port(ds, 0)) ++ return 0; ++ ++ dev_dbg(priv->dev, "port 0 is not the CPU port. 
Checking port 6"); ++ ++ if (dsa_is_cpu_port(ds, 6)) ++ return 6; ++ ++ return -EINVAL; ++} ++ + static int + qca8k_parse_port_config(struct qca8k_priv *priv) + { +@@ -1017,13 +1033,13 @@ static int + qca8k_setup(struct dsa_switch *ds) + { + struct qca8k_priv *priv = (struct qca8k_priv *)ds->priv; +- int ret, i; ++ int cpu_port, ret, i; + u32 mask; + +- /* Make sure that port 0 is the cpu port */ +- if (!dsa_is_cpu_port(ds, 0)) { +- dev_err(priv->dev, "port 0 is not the CPU port"); +- return -EINVAL; ++ cpu_port = qca8k_find_cpu_port(ds); ++ if (cpu_port < 0) { ++ dev_err(priv->dev, "No cpu port configured in both cpu port0 and port6"); ++ return cpu_port; + } + + /* Parse CPU port config to be later used in phy_link mac_config */ +@@ -1065,7 +1081,7 @@ qca8k_setup(struct dsa_switch *ds) + dev_warn(priv->dev, "mib init failed"); + + /* Enable QCA header mode on the cpu port */ +- ret = qca8k_write(priv, QCA8K_REG_PORT_HDR_CTRL(QCA8K_CPU_PORT), ++ ret = qca8k_write(priv, QCA8K_REG_PORT_HDR_CTRL(cpu_port), + QCA8K_PORT_HDR_CTRL_ALL << QCA8K_PORT_HDR_CTRL_TX_S | + QCA8K_PORT_HDR_CTRL_ALL << QCA8K_PORT_HDR_CTRL_RX_S); + if (ret) { +@@ -1087,10 +1103,10 @@ qca8k_setup(struct dsa_switch *ds) + + /* Forward all unknown frames to CPU port for Linux processing */ + ret = qca8k_write(priv, QCA8K_REG_GLOBAL_FW_CTRL1, +- BIT(0) << QCA8K_GLOBAL_FW_CTRL1_IGMP_DP_S | +- BIT(0) << QCA8K_GLOBAL_FW_CTRL1_BC_DP_S | +- BIT(0) << QCA8K_GLOBAL_FW_CTRL1_MC_DP_S | +- BIT(0) << QCA8K_GLOBAL_FW_CTRL1_UC_DP_S); ++ BIT(cpu_port) << QCA8K_GLOBAL_FW_CTRL1_IGMP_DP_S | ++ BIT(cpu_port) << QCA8K_GLOBAL_FW_CTRL1_BC_DP_S | ++ BIT(cpu_port) << QCA8K_GLOBAL_FW_CTRL1_MC_DP_S | ++ BIT(cpu_port) << QCA8K_GLOBAL_FW_CTRL1_UC_DP_S); + if (ret) + return ret; + +@@ -1098,7 +1114,7 @@ qca8k_setup(struct dsa_switch *ds) + for (i = 0; i < QCA8K_NUM_PORTS; i++) { + /* CPU port gets connected to all user ports of the switch */ + if (dsa_is_cpu_port(ds, i)) { +- ret = qca8k_rmw(priv, QCA8K_PORT_LOOKUP_CTRL(QCA8K_CPU_PORT), ++ ret = qca8k_rmw(priv, QCA8K_PORT_LOOKUP_CTRL(cpu_port), + QCA8K_PORT_LOOKUP_MEMBER, dsa_user_ports(ds)); + if (ret) + return ret; +@@ -1110,7 +1126,7 @@ qca8k_setup(struct dsa_switch *ds) + + ret = qca8k_rmw(priv, QCA8K_PORT_LOOKUP_CTRL(i), + QCA8K_PORT_LOOKUP_MEMBER, +- BIT(QCA8K_CPU_PORT)); ++ BIT(cpu_port)); + if (ret) + return ret; + +@@ -1616,9 +1632,12 @@ static int + qca8k_port_bridge_join(struct dsa_switch *ds, int port, struct net_device *br) + { + struct qca8k_priv *priv = (struct qca8k_priv *)ds->priv; +- int port_mask = BIT(QCA8K_CPU_PORT); ++ int port_mask, cpu_port; + int i, ret; + ++ cpu_port = dsa_to_port(ds, port)->cpu_dp->index; ++ port_mask = BIT(cpu_port); ++ + for (i = 1; i < QCA8K_NUM_PORTS; i++) { + if (dsa_to_port(ds, i)->bridge_dev != br) + continue; +@@ -1645,7 +1664,9 @@ static void + qca8k_port_bridge_leave(struct dsa_switch *ds, int port, struct net_device *br) + { + struct qca8k_priv *priv = (struct qca8k_priv *)ds->priv; +- int i; ++ int cpu_port, i; ++ ++ cpu_port = dsa_to_port(ds, port)->cpu_dp->index; + + for (i = 1; i < QCA8K_NUM_PORTS; i++) { + if (dsa_to_port(ds, i)->bridge_dev != br) +@@ -1662,7 +1683,7 @@ qca8k_port_bridge_leave(struct dsa_switc + * this port + */ + qca8k_rmw(priv, QCA8K_PORT_LOOKUP_CTRL(port), +- QCA8K_PORT_LOOKUP_MEMBER, BIT(QCA8K_CPU_PORT)); ++ QCA8K_PORT_LOOKUP_MEMBER, BIT(cpu_port)); + } + + static int +--- a/drivers/net/dsa/qca8k.h ++++ b/drivers/net/dsa/qca8k.h +@@ -24,8 +24,6 @@ + + #define QCA8K_NUM_FDB_RECORDS 2048 + +-#define QCA8K_CPU_PORT 0 +- + 
#define QCA8K_PORT_VID_DEF 1 + + /* Global control registers */ diff --git a/root/target/linux/generic/backport-5.15/747-v5.16-06-net-dsa-qca8k-rework-rgmii-delay-logic-and-scan-for-.patch b/root/target/linux/generic/backport-5.15/747-v5.16-06-net-dsa-qca8k-rework-rgmii-delay-logic-and-scan-for-.patch new file mode 100755 index 00000000..de201764 --- /dev/null +++ b/root/target/linux/generic/backport-5.15/747-v5.16-06-net-dsa-qca8k-rework-rgmii-delay-logic-and-scan-for-.patch @@ -0,0 +1,295 @@ +From 5654ec78dd7e64b1e04777b24007344329e6a63b Mon Sep 17 00:00:00 2001 +From: Ansuel Smith +Date: Thu, 14 Oct 2021 00:39:11 +0200 +Subject: net: dsa: qca8k: rework rgmii delay logic and scan for cpu port 6 + +Future proof commit. This switch have 2 CPU ports and one valid +configuration is first CPU port set to sgmii and second CPU port set to +rgmii-id. The current implementation detects delay only for CPU port +zero set to rgmii and doesn't count any delay set in a secondary CPU +port. Drop the current delay scan function and move it to the sgmii +parser function to generalize and implicitly add support for secondary +CPU port set to rgmii-id. Introduce new logic where delay is enabled +also with internal delay binding declared and rgmii set as PHY mode. + +Signed-off-by: Ansuel Smith +Signed-off-by: David S. Miller +--- + drivers/net/dsa/qca8k.c | 165 ++++++++++++++++++++++++------------------------ + drivers/net/dsa/qca8k.h | 10 ++- + 2 files changed, 89 insertions(+), 86 deletions(-) + +--- a/drivers/net/dsa/qca8k.c ++++ b/drivers/net/dsa/qca8k.c +@@ -889,68 +889,6 @@ qca8k_setup_mdio_bus(struct qca8k_priv * + } + + static int +-qca8k_setup_of_rgmii_delay(struct qca8k_priv *priv) +-{ +- struct device_node *port_dn; +- phy_interface_t mode; +- struct dsa_port *dp; +- u32 val; +- +- /* CPU port is already checked */ +- dp = dsa_to_port(priv->ds, 0); +- +- port_dn = dp->dn; +- +- /* Check if port 0 is set to the correct type */ +- of_get_phy_mode(port_dn, &mode); +- if (mode != PHY_INTERFACE_MODE_RGMII_ID && +- mode != PHY_INTERFACE_MODE_RGMII_RXID && +- mode != PHY_INTERFACE_MODE_RGMII_TXID) { +- return 0; +- } +- +- switch (mode) { +- case PHY_INTERFACE_MODE_RGMII_ID: +- case PHY_INTERFACE_MODE_RGMII_RXID: +- if (of_property_read_u32(port_dn, "rx-internal-delay-ps", &val)) +- val = 2; +- else +- /* Switch regs accept value in ns, convert ps to ns */ +- val = val / 1000; +- +- if (val > QCA8K_MAX_DELAY) { +- dev_err(priv->dev, "rgmii rx delay is limited to a max value of 3ns, setting to the max value"); +- val = 3; +- } +- +- priv->rgmii_rx_delay = val; +- /* Stop here if we need to check only for rx delay */ +- if (mode != PHY_INTERFACE_MODE_RGMII_ID) +- break; +- +- fallthrough; +- case PHY_INTERFACE_MODE_RGMII_TXID: +- if (of_property_read_u32(port_dn, "tx-internal-delay-ps", &val)) +- val = 1; +- else +- /* Switch regs accept value in ns, convert ps to ns */ +- val = val / 1000; +- +- if (val > QCA8K_MAX_DELAY) { +- dev_err(priv->dev, "rgmii tx delay is limited to a max value of 3ns, setting to the max value"); +- val = 3; +- } +- +- priv->rgmii_tx_delay = val; +- break; +- default: +- return 0; +- } +- +- return 0; +-} +- +-static int + qca8k_setup_mac_pwr_sel(struct qca8k_priv *priv) + { + u32 mask = 0; +@@ -996,19 +934,21 @@ static int qca8k_find_cpu_port(struct ds + static int + qca8k_parse_port_config(struct qca8k_priv *priv) + { ++ int port, cpu_port_index = 0, ret; + struct device_node *port_dn; + phy_interface_t mode; + struct dsa_port *dp; +- int port, ret; ++ u32 delay; + + /* We 
have 2 CPU port. Check them */ +- for (port = 0; port < QCA8K_NUM_PORTS; port++) { ++ for (port = 0; port < QCA8K_NUM_PORTS && cpu_port_index < QCA8K_NUM_CPU_PORTS; port++) { + /* Skip every other port */ + if (port != 0 && port != 6) + continue; + + dp = dsa_to_port(priv->ds, port); + port_dn = dp->dn; ++ cpu_port_index++; + + if (!of_device_is_available(port_dn)) + continue; +@@ -1017,12 +957,54 @@ qca8k_parse_port_config(struct qca8k_pri + if (ret) + continue; + +- if (mode == PHY_INTERFACE_MODE_SGMII) { ++ switch (mode) { ++ case PHY_INTERFACE_MODE_RGMII: ++ case PHY_INTERFACE_MODE_RGMII_ID: ++ case PHY_INTERFACE_MODE_RGMII_TXID: ++ case PHY_INTERFACE_MODE_RGMII_RXID: ++ delay = 0; ++ ++ if (!of_property_read_u32(port_dn, "tx-internal-delay-ps", &delay)) ++ /* Switch regs accept value in ns, convert ps to ns */ ++ delay = delay / 1000; ++ else if (mode == PHY_INTERFACE_MODE_RGMII_ID || ++ mode == PHY_INTERFACE_MODE_RGMII_TXID) ++ delay = 1; ++ ++ if (delay > QCA8K_MAX_DELAY) { ++ dev_err(priv->dev, "rgmii tx delay is limited to a max value of 3ns, setting to the max value"); ++ delay = 3; ++ } ++ ++ priv->rgmii_tx_delay[cpu_port_index] = delay; ++ ++ delay = 0; ++ ++ if (!of_property_read_u32(port_dn, "rx-internal-delay-ps", &delay)) ++ /* Switch regs accept value in ns, convert ps to ns */ ++ delay = delay / 1000; ++ else if (mode == PHY_INTERFACE_MODE_RGMII_ID || ++ mode == PHY_INTERFACE_MODE_RGMII_RXID) ++ delay = 2; ++ ++ if (delay > QCA8K_MAX_DELAY) { ++ dev_err(priv->dev, "rgmii rx delay is limited to a max value of 3ns, setting to the max value"); ++ delay = 3; ++ } ++ ++ priv->rgmii_rx_delay[cpu_port_index] = delay; ++ ++ break; ++ case PHY_INTERFACE_MODE_SGMII: + if (of_property_read_bool(port_dn, "qca,sgmii-txclk-falling-edge")) + priv->sgmii_tx_clk_falling_edge = true; + + if (of_property_read_bool(port_dn, "qca,sgmii-rxclk-falling-edge")) + priv->sgmii_rx_clk_falling_edge = true; ++ ++ break; ++ default: ++ continue; + } + } + +@@ -1059,10 +1041,6 @@ qca8k_setup(struct dsa_switch *ds) + if (ret) + return ret; + +- ret = qca8k_setup_of_rgmii_delay(priv); +- if (ret) +- return ret; +- + ret = qca8k_setup_mac_pwr_sel(priv); + if (ret) + return ret; +@@ -1229,8 +1207,8 @@ qca8k_phylink_mac_config(struct dsa_swit + const struct phylink_link_state *state) + { + struct qca8k_priv *priv = ds->priv; +- u32 reg, val; +- int ret; ++ int cpu_port_index, ret; ++ u32 reg, val, delay; + + switch (port) { + case 0: /* 1st CPU port */ +@@ -1242,6 +1220,7 @@ qca8k_phylink_mac_config(struct dsa_swit + return; + + reg = QCA8K_REG_PORT0_PAD_CTRL; ++ cpu_port_index = QCA8K_CPU_PORT0; + break; + case 1: + case 2: +@@ -1260,6 +1239,7 @@ qca8k_phylink_mac_config(struct dsa_swit + return; + + reg = QCA8K_REG_PORT6_PAD_CTRL; ++ cpu_port_index = QCA8K_CPU_PORT6; + break; + default: + dev_err(ds->dev, "%s: unsupported port: %i\n", __func__, port); +@@ -1274,23 +1254,40 @@ qca8k_phylink_mac_config(struct dsa_swit + + switch (state->interface) { + case PHY_INTERFACE_MODE_RGMII: +- /* RGMII mode means no delay so don't enable the delay */ +- qca8k_write(priv, reg, QCA8K_PORT_PAD_RGMII_EN); +- break; + case PHY_INTERFACE_MODE_RGMII_ID: + case PHY_INTERFACE_MODE_RGMII_TXID: + case PHY_INTERFACE_MODE_RGMII_RXID: +- /* RGMII_ID needs internal delay. This is enabled through +- * PORT5_PAD_CTRL for all ports, rather than individual port +- * registers ++ val = QCA8K_PORT_PAD_RGMII_EN; ++ ++ /* Delay can be declared in 3 different way. 
++ * Mode to rgmii and internal-delay standard binding defined ++ * rgmii-id or rgmii-tx/rx phy mode set. ++ * The parse logic set a delay different than 0 only when one ++ * of the 3 different way is used. In all other case delay is ++ * not enabled. With ID or TX/RXID delay is enabled and set ++ * to the default and recommended value. ++ */ ++ if (priv->rgmii_tx_delay[cpu_port_index]) { ++ delay = priv->rgmii_tx_delay[cpu_port_index]; ++ ++ val |= QCA8K_PORT_PAD_RGMII_TX_DELAY(delay) | ++ QCA8K_PORT_PAD_RGMII_TX_DELAY_EN; ++ } ++ ++ if (priv->rgmii_rx_delay[cpu_port_index]) { ++ delay = priv->rgmii_rx_delay[cpu_port_index]; ++ ++ val |= QCA8K_PORT_PAD_RGMII_RX_DELAY(delay) | ++ QCA8K_PORT_PAD_RGMII_RX_DELAY_EN; ++ } ++ ++ /* Set RGMII delay based on the selected values */ ++ qca8k_write(priv, reg, val); ++ ++ /* QCA8337 requires to set rgmii rx delay for all ports. ++ * This is enabled through PORT5_PAD_CTRL for all ports, ++ * rather than individual port registers. + */ +- qca8k_write(priv, reg, +- QCA8K_PORT_PAD_RGMII_EN | +- QCA8K_PORT_PAD_RGMII_TX_DELAY(priv->rgmii_tx_delay) | +- QCA8K_PORT_PAD_RGMII_RX_DELAY(priv->rgmii_rx_delay) | +- QCA8K_PORT_PAD_RGMII_TX_DELAY_EN | +- QCA8K_PORT_PAD_RGMII_RX_DELAY_EN); +- /* QCA8337 requires to set rgmii rx delay */ + if (priv->switch_id == QCA8K_ID_QCA8337) + qca8k_write(priv, QCA8K_REG_PORT5_PAD_CTRL, + QCA8K_PORT_PAD_RGMII_RX_DELAY_EN); +--- a/drivers/net/dsa/qca8k.h ++++ b/drivers/net/dsa/qca8k.h +@@ -13,6 +13,7 @@ + #include + + #define QCA8K_NUM_PORTS 7 ++#define QCA8K_NUM_CPU_PORTS 2 + #define QCA8K_MAX_MTU 9000 + + #define PHY_ID_QCA8327 0x004dd034 +@@ -255,13 +256,18 @@ struct qca8k_match_data { + u8 id; + }; + ++enum { ++ QCA8K_CPU_PORT0, ++ QCA8K_CPU_PORT6, ++}; ++ + struct qca8k_priv { + u8 switch_id; + u8 switch_revision; +- u8 rgmii_tx_delay; +- u8 rgmii_rx_delay; + bool sgmii_rx_clk_falling_edge; + bool sgmii_tx_clk_falling_edge; ++ u8 rgmii_rx_delay[QCA8K_NUM_CPU_PORTS]; /* 0: CPU port0, 1: CPU port6 */ ++ u8 rgmii_tx_delay[QCA8K_NUM_CPU_PORTS]; /* 0: CPU port0, 1: CPU port6 */ + bool legacy_phy_port_mapping; + struct regmap *regmap; + struct mii_bus *bus; diff --git a/root/target/linux/generic/backport-5.15/747-v5.16-07-dt-bindings-net-dsa-qca8k-Document-qca-sgmii-enable-.patch b/root/target/linux/generic/backport-5.15/747-v5.16-07-dt-bindings-net-dsa-qca8k-Document-qca-sgmii-enable-.patch new file mode 100755 index 00000000..8abd264e --- /dev/null +++ b/root/target/linux/generic/backport-5.15/747-v5.16-07-dt-bindings-net-dsa-qca8k-Document-qca-sgmii-enable-.patch @@ -0,0 +1,33 @@ +From 13ad5ccc093ff448b99ac7e138e91e78796adb48 Mon Sep 17 00:00:00 2001 +From: Ansuel Smith +Date: Thu, 14 Oct 2021 00:39:12 +0200 +Subject: dt-bindings: net: dsa: qca8k: Document qca,sgmii-enable-pll + +Document qca,sgmii-enable-pll binding used in the CPU nodes to +enable SGMII PLL on MAC config. + +Signed-off-by: Ansuel Smith +Signed-off-by: David S. Miller +--- + Documentation/devicetree/bindings/net/dsa/qca8k.txt | 10 ++++++++++ + 1 file changed, 10 insertions(+) + +--- a/Documentation/devicetree/bindings/net/dsa/qca8k.txt ++++ b/Documentation/devicetree/bindings/net/dsa/qca8k.txt +@@ -45,6 +45,16 @@ A CPU port node has the following option + Mostly used in qca8327 with CPU port 0 set to + sgmii. + - qca,sgmii-txclk-falling-edge: Set the transmit clock phase to falling edge. ++- qca,sgmii-enable-pll : For SGMII CPU port, explicitly enable PLL, TX and RX ++ chain along with Signal Detection. ++ This should NOT be enabled for qca8327. 
If enabled with ++ qca8327 the sgmii port won't correctly init and an err ++ is printed. ++ This can be required for qca8337 switch with revision 2. ++ A warning is displayed when used with revision greater ++ 2. ++ With CPU port set to sgmii and qca8337 it is advised ++ to set this unless a communication problem is observed. + + For QCA8K the 'fixed-link' sub-node supports only the following properties: + diff --git a/root/target/linux/generic/backport-5.15/747-v5.16-08-net-dsa-qca8k-add-explicit-SGMII-PLL-enable.patch b/root/target/linux/generic/backport-5.15/747-v5.16-08-net-dsa-qca8k-add-explicit-SGMII-PLL-enable.patch new file mode 100755 index 00000000..2b5a84a1 --- /dev/null +++ b/root/target/linux/generic/backport-5.15/747-v5.16-08-net-dsa-qca8k-add-explicit-SGMII-PLL-enable.patch @@ -0,0 +1,65 @@ +From bbc4799e8bb6c397e3b3fec13de68e179f5db9ff Mon Sep 17 00:00:00 2001 +From: Ansuel Smith +Date: Thu, 14 Oct 2021 00:39:13 +0200 +Subject: net: dsa: qca8k: add explicit SGMII PLL enable + +Support enabling PLL on the SGMII CPU port. Some device require this +special configuration or no traffic is transmitted and the switch +doesn't work at all. A dedicated binding is added to the CPU node +port to apply the correct reg on mac config. +Fail to correctly configure sgmii with qca8327 switch and warn if pll is +used on qca8337 with a revision greater than 1. + +Signed-off-by: Ansuel Smith +Reviewed-by: Florian Fainelli +Signed-off-by: David S. Miller +--- + drivers/net/dsa/qca8k.c | 19 +++++++++++++++++-- + drivers/net/dsa/qca8k.h | 1 + + 2 files changed, 18 insertions(+), 2 deletions(-) + +--- a/drivers/net/dsa/qca8k.c ++++ b/drivers/net/dsa/qca8k.c +@@ -1002,6 +1002,18 @@ qca8k_parse_port_config(struct qca8k_pri + if (of_property_read_bool(port_dn, "qca,sgmii-rxclk-falling-edge")) + priv->sgmii_rx_clk_falling_edge = true; + ++ if (of_property_read_bool(port_dn, "qca,sgmii-enable-pll")) { ++ priv->sgmii_enable_pll = true; ++ ++ if (priv->switch_id == QCA8K_ID_QCA8327) { ++ dev_err(priv->dev, "SGMII PLL should NOT be enabled for qca8327. 
Aborting enabling"); ++ priv->sgmii_enable_pll = false; ++ } ++ ++ if (priv->switch_revision < 2) ++ dev_warn(priv->dev, "SGMII PLL should NOT be enabled for qca8337 with revision 2 or more."); ++ } ++ + break; + default: + continue; +@@ -1312,8 +1324,11 @@ qca8k_phylink_mac_config(struct dsa_swit + if (ret) + return; + +- val |= QCA8K_SGMII_EN_PLL | QCA8K_SGMII_EN_RX | +- QCA8K_SGMII_EN_TX | QCA8K_SGMII_EN_SD; ++ val |= QCA8K_SGMII_EN_SD; ++ ++ if (priv->sgmii_enable_pll) ++ val |= QCA8K_SGMII_EN_PLL | QCA8K_SGMII_EN_RX | ++ QCA8K_SGMII_EN_TX; + + if (dsa_is_cpu_port(ds, port)) { + /* CPU port, we're talking to the CPU MAC, be a PHY */ +--- a/drivers/net/dsa/qca8k.h ++++ b/drivers/net/dsa/qca8k.h +@@ -266,6 +266,7 @@ struct qca8k_priv { + u8 switch_revision; + bool sgmii_rx_clk_falling_edge; + bool sgmii_tx_clk_falling_edge; ++ bool sgmii_enable_pll; + u8 rgmii_rx_delay[QCA8K_NUM_CPU_PORTS]; /* 0: CPU port0, 1: CPU port6 */ + u8 rgmii_tx_delay[QCA8K_NUM_CPU_PORTS]; /* 0: CPU port0, 1: CPU port6 */ + bool legacy_phy_port_mapping; diff --git a/root/target/linux/generic/backport-5.15/747-v5.16-09-dt-bindings-net-dsa-qca8k-Document-qca-led-open-drai.patch b/root/target/linux/generic/backport-5.15/747-v5.16-09-dt-bindings-net-dsa-qca8k-Document-qca-led-open-drai.patch new file mode 100755 index 00000000..38dc954e --- /dev/null +++ b/root/target/linux/generic/backport-5.15/747-v5.16-09-dt-bindings-net-dsa-qca8k-Document-qca-led-open-drai.patch @@ -0,0 +1,37 @@ +From 924087c5c3d41553700b0eb83ca2a53b91643dca Mon Sep 17 00:00:00 2001 +From: Ansuel Smith +Date: Thu, 14 Oct 2021 00:39:14 +0200 +Subject: dt-bindings: net: dsa: qca8k: Document qca,led-open-drain binding + +Document new binding qca,ignore-power-on-sel used to ignore +power on strapping and use sw regs instead. +Document qca,led-open.drain to set led to open drain mode, the +qca,ignore-power-on-sel is mandatory with this enabled or an error will +be reported. + +Signed-off-by: Ansuel Smith +Signed-off-by: David S. Miller +--- + Documentation/devicetree/bindings/net/dsa/qca8k.txt | 11 +++++++++++ + 1 file changed, 11 insertions(+) + +--- a/Documentation/devicetree/bindings/net/dsa/qca8k.txt ++++ b/Documentation/devicetree/bindings/net/dsa/qca8k.txt +@@ -13,6 +13,17 @@ Required properties: + Optional properties: + + - reset-gpios: GPIO to be used to reset the whole device ++- qca,ignore-power-on-sel: Ignore power on pin strapping to configure led open ++ drain or eeprom presence. This is needed for broken ++ devices that have wrong configuration or when the oem ++ decided to not use pin strapping and fallback to sw ++ regs. ++- qca,led-open-drain: Set leds to open-drain mode. This requires the ++ qca,ignore-power-on-sel to be set or the driver will fail ++ to probe. This is needed if the oem doesn't use pin ++ strapping to set this mode and prefers to set it using sw ++ regs. 
The pin strapping related to led open drain mode is ++ the pin B68 for QCA832x and B49 for QCA833x + + Subnodes: + diff --git a/root/target/linux/generic/backport-5.15/747-v5.16-10-net-dsa-qca8k-add-support-for-pws-config-reg.patch b/root/target/linux/generic/backport-5.15/747-v5.16-10-net-dsa-qca8k-add-support-for-pws-config-reg.patch new file mode 100755 index 00000000..aa5d92a4 --- /dev/null +++ b/root/target/linux/generic/backport-5.15/747-v5.16-10-net-dsa-qca8k-add-support-for-pws-config-reg.patch @@ -0,0 +1,92 @@ +From 362bb238d8bf1470424214a8a5968d9c6cce68fa Mon Sep 17 00:00:00 2001 +From: Ansuel Smith +Date: Thu, 14 Oct 2021 00:39:15 +0200 +Subject: net: dsa: qca8k: add support for pws config reg + +Some qca8327 switch require to force the ignore of power on sel +strapping. Some switch require to set the led open drain mode in regs +instead of using strapping. While most of the device implements this +using the correct way using pin strapping, there are still some broken +device that require to be set using sw regs. +Introduce a new binding and support these special configuration. +As led open drain require to ignore pin strapping to work, the probe +fails with EINVAL error with incorrect configuration. + +Signed-off-by: Ansuel Smith +Reviewed-by: Florian Fainelli +Signed-off-by: David S. Miller +--- + drivers/net/dsa/qca8k.c | 39 +++++++++++++++++++++++++++++++++++++++ + drivers/net/dsa/qca8k.h | 6 ++++++ + 2 files changed, 45 insertions(+) + +--- a/drivers/net/dsa/qca8k.c ++++ b/drivers/net/dsa/qca8k.c +@@ -932,6 +932,41 @@ static int qca8k_find_cpu_port(struct ds + } + + static int ++qca8k_setup_of_pws_reg(struct qca8k_priv *priv) ++{ ++ struct device_node *node = priv->dev->of_node; ++ u32 val = 0; ++ int ret; ++ ++ /* QCA8327 require to set to the correct mode. ++ * His bigger brother QCA8328 have the 172 pin layout. ++ * Should be applied by default but we set this just to make sure. 
++ */ ++ if (priv->switch_id == QCA8K_ID_QCA8327) { ++ ret = qca8k_rmw(priv, QCA8K_REG_PWS, QCA8327_PWS_PACKAGE148_EN, ++ QCA8327_PWS_PACKAGE148_EN); ++ if (ret) ++ return ret; ++ } ++ ++ if (of_property_read_bool(node, "qca,ignore-power-on-sel")) ++ val |= QCA8K_PWS_POWER_ON_SEL; ++ ++ if (of_property_read_bool(node, "qca,led-open-drain")) { ++ if (!(val & QCA8K_PWS_POWER_ON_SEL)) { ++ dev_err(priv->dev, "qca,led-open-drain require qca,ignore-power-on-sel to be set."); ++ return -EINVAL; ++ } ++ ++ val |= QCA8K_PWS_LED_OPEN_EN_CSR; ++ } ++ ++ return qca8k_rmw(priv, QCA8K_REG_PWS, ++ QCA8K_PWS_LED_OPEN_EN_CSR | QCA8K_PWS_POWER_ON_SEL, ++ val); ++} ++ ++static int + qca8k_parse_port_config(struct qca8k_priv *priv) + { + int port, cpu_port_index = 0, ret; +@@ -1053,6 +1088,10 @@ qca8k_setup(struct dsa_switch *ds) + if (ret) + return ret; + ++ ret = qca8k_setup_of_pws_reg(priv); ++ if (ret) ++ return ret; ++ + ret = qca8k_setup_mac_pwr_sel(priv); + if (ret) + return ret; +--- a/drivers/net/dsa/qca8k.h ++++ b/drivers/net/dsa/qca8k.h +@@ -46,6 +46,12 @@ + #define QCA8K_MAX_DELAY 3 + #define QCA8K_PORT_PAD_SGMII_EN BIT(7) + #define QCA8K_REG_PWS 0x010 ++#define QCA8K_PWS_POWER_ON_SEL BIT(31) ++/* This reg is only valid for QCA832x and toggle the package ++ * type from 176 pin (by default) to 148 pin used on QCA8327 ++ */ ++#define QCA8327_PWS_PACKAGE148_EN BIT(30) ++#define QCA8K_PWS_LED_OPEN_EN_CSR BIT(24) + #define QCA8K_PWS_SERDES_AEN_DIS BIT(7) + #define QCA8K_REG_MODULE_EN 0x030 + #define QCA8K_MODULE_EN_MIB BIT(0) diff --git a/root/target/linux/generic/backport-5.15/747-v5.16-11-dt-bindings-net-dsa-qca8k-document-support-for-qca83.patch b/root/target/linux/generic/backport-5.15/747-v5.16-11-dt-bindings-net-dsa-qca8k-document-support-for-qca83.patch new file mode 100755 index 00000000..1bfb00c5 --- /dev/null +++ b/root/target/linux/generic/backport-5.15/747-v5.16-11-dt-bindings-net-dsa-qca8k-document-support-for-qca83.patch @@ -0,0 +1,32 @@ +From ed7988d77fbfb79366b68f9e7fa60a6080da23d4 Mon Sep 17 00:00:00 2001 +From: Ansuel Smith +Date: Thu, 14 Oct 2021 00:39:16 +0200 +Subject: dt-bindings: net: dsa: qca8k: document support for qca8328 + +QCA8328 is the bigger brother of qca8327. Document the new compatible +binding and add some information to understand the various switch +compatible. + +Signed-off-by: Ansuel Smith +Reviewed-by: Florian Fainelli +Signed-off-by: David S. 
Miller +--- + Documentation/devicetree/bindings/net/dsa/qca8k.txt | 7 ++++--- + 1 file changed, 4 insertions(+), 3 deletions(-) + +--- a/Documentation/devicetree/bindings/net/dsa/qca8k.txt ++++ b/Documentation/devicetree/bindings/net/dsa/qca8k.txt +@@ -3,9 +3,10 @@ + Required properties: + + - compatible: should be one of: +- "qca,qca8327" +- "qca,qca8334" +- "qca,qca8337" ++ "qca,qca8328": referenced as AR8328(N)-AK1(A/B) QFN 176 pin package ++ "qca,qca8327": referenced as AR8327(N)-AL1A DR-QFN 148 pin package ++ "qca,qca8334": referenced as QCA8334-AL3C QFN 88 pin package ++ "qca,qca8337": referenced as QCA8337N-AL3(B/C) DR-QFN 148 pin package + + - #size-cells: must be 0 + - #address-cells: must be 1 diff --git a/root/target/linux/generic/backport-5.15/747-v5.16-12-net-dsa-qca8k-add-support-for-QCA8328.patch b/root/target/linux/generic/backport-5.15/747-v5.16-12-net-dsa-qca8k-add-support-for-QCA8328.patch new file mode 100755 index 00000000..70f227fb --- /dev/null +++ b/root/target/linux/generic/backport-5.15/747-v5.16-12-net-dsa-qca8k-add-support-for-QCA8328.patch @@ -0,0 +1,78 @@ +From f477d1c8bdbef4f400718238e350f16f521d2a3e Mon Sep 17 00:00:00 2001 +From: Ansuel Smith +Date: Thu, 14 Oct 2021 00:39:17 +0200 +Subject: net: dsa: qca8k: add support for QCA8328 + +QCA8328 switch is the bigger brother of the qca8327. Same regs different +chip. Change the function to set the correct pin layout and introduce a +new match_data to differentiate the 2 switch as they have the same ID +and their internal PHY have the same ID. + +Signed-off-by: Ansuel Smith +Reviewed-by: Florian Fainelli +Signed-off-by: David S. Miller +--- + drivers/net/dsa/qca8k.c | 19 ++++++++++++++++--- + drivers/net/dsa/qca8k.h | 1 + + 2 files changed, 17 insertions(+), 3 deletions(-) + +--- a/drivers/net/dsa/qca8k.c ++++ b/drivers/net/dsa/qca8k.c +@@ -935,6 +935,7 @@ static int + qca8k_setup_of_pws_reg(struct qca8k_priv *priv) + { + struct device_node *node = priv->dev->of_node; ++ const struct qca8k_match_data *data; + u32 val = 0; + int ret; + +@@ -943,8 +944,14 @@ qca8k_setup_of_pws_reg(struct qca8k_priv + * Should be applied by default but we set this just to make sure. 
+ */ + if (priv->switch_id == QCA8K_ID_QCA8327) { ++ data = of_device_get_match_data(priv->dev); ++ ++ /* Set the correct package of 148 pin for QCA8327 */ ++ if (data->reduced_package) ++ val |= QCA8327_PWS_PACKAGE148_EN; ++ + ret = qca8k_rmw(priv, QCA8K_REG_PWS, QCA8327_PWS_PACKAGE148_EN, +- QCA8327_PWS_PACKAGE148_EN); ++ val); + if (ret) + return ret; + } +@@ -2105,7 +2112,12 @@ static int qca8k_resume(struct device *d + static SIMPLE_DEV_PM_OPS(qca8k_pm_ops, + qca8k_suspend, qca8k_resume); + +-static const struct qca8k_match_data qca832x = { ++static const struct qca8k_match_data qca8327 = { ++ .id = QCA8K_ID_QCA8327, ++ .reduced_package = true, ++}; ++ ++static const struct qca8k_match_data qca8328 = { + .id = QCA8K_ID_QCA8327, + }; + +@@ -2114,7 +2126,8 @@ static const struct qca8k_match_data qca + }; + + static const struct of_device_id qca8k_of_match[] = { +- { .compatible = "qca,qca8327", .data = &qca832x }, ++ { .compatible = "qca,qca8327", .data = &qca8327 }, ++ { .compatible = "qca,qca8328", .data = &qca8328 }, + { .compatible = "qca,qca8334", .data = &qca833x }, + { .compatible = "qca,qca8337", .data = &qca833x }, + { /* sentinel */ }, +--- a/drivers/net/dsa/qca8k.h ++++ b/drivers/net/dsa/qca8k.h +@@ -260,6 +260,7 @@ struct ar8xxx_port_status { + + struct qca8k_match_data { + u8 id; ++ bool reduced_package; + }; + + enum { diff --git a/root/target/linux/generic/backport-5.15/747-v5.16-13-net-dsa-qca8k-set-internal-delay-also-for-sgmii.patch b/root/target/linux/generic/backport-5.15/747-v5.16-13-net-dsa-qca8k-set-internal-delay-also-for-sgmii.patch new file mode 100755 index 00000000..27f94dca --- /dev/null +++ b/root/target/linux/generic/backport-5.15/747-v5.16-13-net-dsa-qca8k-set-internal-delay-also-for-sgmii.patch @@ -0,0 +1,159 @@ +From cef08115846e581f80ff99abf7bf218da1840616 Mon Sep 17 00:00:00 2001 +From: Ansuel Smith +Date: Thu, 14 Oct 2021 00:39:18 +0200 +Subject: net: dsa: qca8k: set internal delay also for sgmii + +QCA original code report port instability and sa that SGMII also require +to set internal delay. Generalize the rgmii delay function and apply the +advised value if they are not defined in DT. + +Signed-off-by: Ansuel Smith +Signed-off-by: David S. Miller +--- + drivers/net/dsa/qca8k.c | 88 +++++++++++++++++++++++++++++++++---------------- + drivers/net/dsa/qca8k.h | 2 ++ + 2 files changed, 62 insertions(+), 28 deletions(-) + +--- a/drivers/net/dsa/qca8k.c ++++ b/drivers/net/dsa/qca8k.c +@@ -1004,6 +1004,7 @@ qca8k_parse_port_config(struct qca8k_pri + case PHY_INTERFACE_MODE_RGMII_ID: + case PHY_INTERFACE_MODE_RGMII_TXID: + case PHY_INTERFACE_MODE_RGMII_RXID: ++ case PHY_INTERFACE_MODE_SGMII: + delay = 0; + + if (!of_property_read_u32(port_dn, "tx-internal-delay-ps", &delay)) +@@ -1036,8 +1037,13 @@ qca8k_parse_port_config(struct qca8k_pri + + priv->rgmii_rx_delay[cpu_port_index] = delay; + +- break; +- case PHY_INTERFACE_MODE_SGMII: ++ /* Skip sgmii parsing for rgmii* mode */ ++ if (mode == PHY_INTERFACE_MODE_RGMII || ++ mode == PHY_INTERFACE_MODE_RGMII_ID || ++ mode == PHY_INTERFACE_MODE_RGMII_TXID || ++ mode == PHY_INTERFACE_MODE_RGMII_RXID) ++ break; ++ + if (of_property_read_bool(port_dn, "qca,sgmii-txclk-falling-edge")) + priv->sgmii_tx_clk_falling_edge = true; + +@@ -1261,12 +1267,53 @@ qca8k_setup(struct dsa_switch *ds) + } + + static void ++qca8k_mac_config_setup_internal_delay(struct qca8k_priv *priv, int cpu_port_index, ++ u32 reg) ++{ ++ u32 delay, val = 0; ++ int ret; ++ ++ /* Delay can be declared in 3 different way. 
++ * Mode to rgmii and internal-delay standard binding defined ++ * rgmii-id or rgmii-tx/rx phy mode set. ++ * The parse logic set a delay different than 0 only when one ++ * of the 3 different way is used. In all other case delay is ++ * not enabled. With ID or TX/RXID delay is enabled and set ++ * to the default and recommended value. ++ */ ++ if (priv->rgmii_tx_delay[cpu_port_index]) { ++ delay = priv->rgmii_tx_delay[cpu_port_index]; ++ ++ val |= QCA8K_PORT_PAD_RGMII_TX_DELAY(delay) | ++ QCA8K_PORT_PAD_RGMII_TX_DELAY_EN; ++ } ++ ++ if (priv->rgmii_rx_delay[cpu_port_index]) { ++ delay = priv->rgmii_rx_delay[cpu_port_index]; ++ ++ val |= QCA8K_PORT_PAD_RGMII_RX_DELAY(delay) | ++ QCA8K_PORT_PAD_RGMII_RX_DELAY_EN; ++ } ++ ++ /* Set RGMII delay based on the selected values */ ++ ret = qca8k_rmw(priv, reg, ++ QCA8K_PORT_PAD_RGMII_TX_DELAY_MASK | ++ QCA8K_PORT_PAD_RGMII_RX_DELAY_MASK | ++ QCA8K_PORT_PAD_RGMII_TX_DELAY_EN | ++ QCA8K_PORT_PAD_RGMII_RX_DELAY_EN, ++ val); ++ if (ret) ++ dev_err(priv->dev, "Failed to set internal delay for CPU port%d", ++ cpu_port_index == QCA8K_CPU_PORT0 ? 0 : 6); ++} ++ ++static void + qca8k_phylink_mac_config(struct dsa_switch *ds, int port, unsigned int mode, + const struct phylink_link_state *state) + { + struct qca8k_priv *priv = ds->priv; + int cpu_port_index, ret; +- u32 reg, val, delay; ++ u32 reg, val; + + switch (port) { + case 0: /* 1st CPU port */ +@@ -1315,32 +1362,10 @@ qca8k_phylink_mac_config(struct dsa_swit + case PHY_INTERFACE_MODE_RGMII_ID: + case PHY_INTERFACE_MODE_RGMII_TXID: + case PHY_INTERFACE_MODE_RGMII_RXID: +- val = QCA8K_PORT_PAD_RGMII_EN; +- +- /* Delay can be declared in 3 different way. +- * Mode to rgmii and internal-delay standard binding defined +- * rgmii-id or rgmii-tx/rx phy mode set. +- * The parse logic set a delay different than 0 only when one +- * of the 3 different way is used. In all other case delay is +- * not enabled. With ID or TX/RXID delay is enabled and set +- * to the default and recommended value. +- */ +- if (priv->rgmii_tx_delay[cpu_port_index]) { +- delay = priv->rgmii_tx_delay[cpu_port_index]; +- +- val |= QCA8K_PORT_PAD_RGMII_TX_DELAY(delay) | +- QCA8K_PORT_PAD_RGMII_TX_DELAY_EN; +- } +- +- if (priv->rgmii_rx_delay[cpu_port_index]) { +- delay = priv->rgmii_rx_delay[cpu_port_index]; +- +- val |= QCA8K_PORT_PAD_RGMII_RX_DELAY(delay) | +- QCA8K_PORT_PAD_RGMII_RX_DELAY_EN; +- } ++ qca8k_write(priv, reg, QCA8K_PORT_PAD_RGMII_EN); + +- /* Set RGMII delay based on the selected values */ +- qca8k_write(priv, reg, val); ++ /* Configure rgmii delay */ ++ qca8k_mac_config_setup_internal_delay(priv, cpu_port_index, reg); + + /* QCA8337 requires to set rgmii rx delay for all ports. + * This is enabled through PORT5_PAD_CTRL for all ports, +@@ -1411,6 +1436,13 @@ qca8k_phylink_mac_config(struct dsa_swit + QCA8K_PORT0_PAD_SGMII_RXCLK_FALLING_EDGE | + QCA8K_PORT0_PAD_SGMII_TXCLK_FALLING_EDGE, + val); ++ ++ /* From original code is reported port instability as SGMII also ++ * require delay set. Apply advised values here or take them from DT. 
++ */ ++ if (state->interface == PHY_INTERFACE_MODE_SGMII) ++ qca8k_mac_config_setup_internal_delay(priv, cpu_port_index, reg); ++ + break; + default: + dev_err(ds->dev, "xMII mode %s not supported for port %d\n", +--- a/drivers/net/dsa/qca8k.h ++++ b/drivers/net/dsa/qca8k.h +@@ -39,7 +39,9 @@ + #define QCA8K_REG_PORT5_PAD_CTRL 0x008 + #define QCA8K_REG_PORT6_PAD_CTRL 0x00c + #define QCA8K_PORT_PAD_RGMII_EN BIT(26) ++#define QCA8K_PORT_PAD_RGMII_TX_DELAY_MASK GENMASK(23, 22) + #define QCA8K_PORT_PAD_RGMII_TX_DELAY(x) ((x) << 22) ++#define QCA8K_PORT_PAD_RGMII_RX_DELAY_MASK GENMASK(21, 20) + #define QCA8K_PORT_PAD_RGMII_RX_DELAY(x) ((x) << 20) + #define QCA8K_PORT_PAD_RGMII_TX_DELAY_EN BIT(25) + #define QCA8K_PORT_PAD_RGMII_RX_DELAY_EN BIT(24) diff --git a/root/target/linux/generic/backport-5.15/747-v5.16-14-net-dsa-qca8k-move-port-config-to-dedicated-struct.patch b/root/target/linux/generic/backport-5.15/747-v5.16-14-net-dsa-qca8k-move-port-config-to-dedicated-struct.patch new file mode 100755 index 00000000..b991798c --- /dev/null +++ b/root/target/linux/generic/backport-5.15/747-v5.16-14-net-dsa-qca8k-move-port-config-to-dedicated-struct.patch @@ -0,0 +1,124 @@ +From fd0bb28c547f7c8affb1691128cece38f5b626a1 Mon Sep 17 00:00:00 2001 +From: Ansuel Smith +Date: Thu, 14 Oct 2021 00:39:19 +0200 +Subject: net: dsa: qca8k: move port config to dedicated struct + +Move ports related config to dedicated struct to keep things organized. + +Signed-off-by: Ansuel Smith +Reviewed-by: Florian Fainelli +Signed-off-by: David S. Miller +--- + drivers/net/dsa/qca8k.c | 26 +++++++++++++------------- + drivers/net/dsa/qca8k.h | 10 +++++++--- + 2 files changed, 20 insertions(+), 16 deletions(-) + +--- a/drivers/net/dsa/qca8k.c ++++ b/drivers/net/dsa/qca8k.c +@@ -1019,7 +1019,7 @@ qca8k_parse_port_config(struct qca8k_pri + delay = 3; + } + +- priv->rgmii_tx_delay[cpu_port_index] = delay; ++ priv->ports_config.rgmii_tx_delay[cpu_port_index] = delay; + + delay = 0; + +@@ -1035,7 +1035,7 @@ qca8k_parse_port_config(struct qca8k_pri + delay = 3; + } + +- priv->rgmii_rx_delay[cpu_port_index] = delay; ++ priv->ports_config.rgmii_rx_delay[cpu_port_index] = delay; + + /* Skip sgmii parsing for rgmii* mode */ + if (mode == PHY_INTERFACE_MODE_RGMII || +@@ -1045,17 +1045,17 @@ qca8k_parse_port_config(struct qca8k_pri + break; + + if (of_property_read_bool(port_dn, "qca,sgmii-txclk-falling-edge")) +- priv->sgmii_tx_clk_falling_edge = true; ++ priv->ports_config.sgmii_tx_clk_falling_edge = true; + + if (of_property_read_bool(port_dn, "qca,sgmii-rxclk-falling-edge")) +- priv->sgmii_rx_clk_falling_edge = true; ++ priv->ports_config.sgmii_rx_clk_falling_edge = true; + + if (of_property_read_bool(port_dn, "qca,sgmii-enable-pll")) { +- priv->sgmii_enable_pll = true; ++ priv->ports_config.sgmii_enable_pll = true; + + if (priv->switch_id == QCA8K_ID_QCA8327) { + dev_err(priv->dev, "SGMII PLL should NOT be enabled for qca8327. Aborting enabling"); +- priv->sgmii_enable_pll = false; ++ priv->ports_config.sgmii_enable_pll = false; + } + + if (priv->switch_revision < 2) +@@ -1281,15 +1281,15 @@ qca8k_mac_config_setup_internal_delay(st + * not enabled. With ID or TX/RXID delay is enabled and set + * to the default and recommended value. 
+ */ +- if (priv->rgmii_tx_delay[cpu_port_index]) { +- delay = priv->rgmii_tx_delay[cpu_port_index]; ++ if (priv->ports_config.rgmii_tx_delay[cpu_port_index]) { ++ delay = priv->ports_config.rgmii_tx_delay[cpu_port_index]; + + val |= QCA8K_PORT_PAD_RGMII_TX_DELAY(delay) | + QCA8K_PORT_PAD_RGMII_TX_DELAY_EN; + } + +- if (priv->rgmii_rx_delay[cpu_port_index]) { +- delay = priv->rgmii_rx_delay[cpu_port_index]; ++ if (priv->ports_config.rgmii_rx_delay[cpu_port_index]) { ++ delay = priv->ports_config.rgmii_rx_delay[cpu_port_index]; + + val |= QCA8K_PORT_PAD_RGMII_RX_DELAY(delay) | + QCA8K_PORT_PAD_RGMII_RX_DELAY_EN; +@@ -1397,7 +1397,7 @@ qca8k_phylink_mac_config(struct dsa_swit + + val |= QCA8K_SGMII_EN_SD; + +- if (priv->sgmii_enable_pll) ++ if (priv->ports_config.sgmii_enable_pll) + val |= QCA8K_SGMII_EN_PLL | QCA8K_SGMII_EN_RX | + QCA8K_SGMII_EN_TX; + +@@ -1425,10 +1425,10 @@ qca8k_phylink_mac_config(struct dsa_swit + val = 0; + + /* SGMII Clock phase configuration */ +- if (priv->sgmii_rx_clk_falling_edge) ++ if (priv->ports_config.sgmii_rx_clk_falling_edge) + val |= QCA8K_PORT0_PAD_SGMII_RXCLK_FALLING_EDGE; + +- if (priv->sgmii_tx_clk_falling_edge) ++ if (priv->ports_config.sgmii_tx_clk_falling_edge) + val |= QCA8K_PORT0_PAD_SGMII_TXCLK_FALLING_EDGE; + + if (val) +--- a/drivers/net/dsa/qca8k.h ++++ b/drivers/net/dsa/qca8k.h +@@ -270,15 +270,19 @@ enum { + QCA8K_CPU_PORT6, + }; + +-struct qca8k_priv { +- u8 switch_id; +- u8 switch_revision; ++struct qca8k_ports_config { + bool sgmii_rx_clk_falling_edge; + bool sgmii_tx_clk_falling_edge; + bool sgmii_enable_pll; + u8 rgmii_rx_delay[QCA8K_NUM_CPU_PORTS]; /* 0: CPU port0, 1: CPU port6 */ + u8 rgmii_tx_delay[QCA8K_NUM_CPU_PORTS]; /* 0: CPU port0, 1: CPU port6 */ ++}; ++ ++struct qca8k_priv { ++ u8 switch_id; ++ u8 switch_revision; + bool legacy_phy_port_mapping; ++ struct qca8k_ports_config ports_config; + struct regmap *regmap; + struct mii_bus *bus; + struct ar8xxx_port_status port_sts[QCA8K_NUM_PORTS]; diff --git a/root/target/linux/generic/backport-5.15/747-v5.16-15-dt-bindings-net-ipq8064-mdio-fix-warning-with-new-qc.patch b/root/target/linux/generic/backport-5.15/747-v5.16-15-dt-bindings-net-ipq8064-mdio-fix-warning-with-new-qc.patch new file mode 100755 index 00000000..f7cb5141 --- /dev/null +++ b/root/target/linux/generic/backport-5.15/747-v5.16-15-dt-bindings-net-ipq8064-mdio-fix-warning-with-new-qc.patch @@ -0,0 +1,26 @@ +From e52073a8e3086046a098b8a7cbeb282ff0cdb424 Mon Sep 17 00:00:00 2001 +From: Ansuel Smith +Date: Thu, 14 Oct 2021 00:39:20 +0200 +Subject: dt-bindings: net: ipq8064-mdio: fix warning with new qca8k switch + +Fix warning now that we have qca8k switch Documentation using yaml. + +Signed-off-by: Ansuel Smith +Signed-off-by: David S. Miller +--- + Documentation/devicetree/bindings/net/qcom,ipq8064-mdio.yaml | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +--- a/Documentation/devicetree/bindings/net/qcom,ipq8064-mdio.yaml ++++ b/Documentation/devicetree/bindings/net/qcom,ipq8064-mdio.yaml +@@ -51,6 +51,9 @@ examples: + switch@10 { + compatible = "qca,qca8337"; + reg = <0x10>; +- /* ... */ ++ ++ ports { ++ /* ... 
*/ ++ }; + }; + }; diff --git a/root/target/linux/generic/backport-5.15/747-v5.16-16-dt-bindings-net-dsa-qca8k-convert-to-YAML-schema.patch b/root/target/linux/generic/backport-5.15/747-v5.16-16-dt-bindings-net-dsa-qca8k-convert-to-YAML-schema.patch new file mode 100755 index 00000000..b9bce97d --- /dev/null +++ b/root/target/linux/generic/backport-5.15/747-v5.16-16-dt-bindings-net-dsa-qca8k-convert-to-YAML-schema.patch @@ -0,0 +1,631 @@ +From d291fbb8245d5ba04979fed85575860a5cea7196 Mon Sep 17 00:00:00 2001 +From: Matthew Hagan +Date: Thu, 14 Oct 2021 00:39:21 +0200 +Subject: dt-bindings: net: dsa: qca8k: convert to YAML schema + +Convert the qca8k bindings to YAML format. + +Signed-off-by: Matthew Hagan +Co-developed-by: Ansuel Smith +Signed-off-by: Ansuel Smith +Signed-off-by: David S. Miller +--- + .../devicetree/bindings/net/dsa/qca8k.txt | 245 -------------- + .../devicetree/bindings/net/dsa/qca8k.yaml | 362 +++++++++++++++++++++ + 2 files changed, 362 insertions(+), 245 deletions(-) + delete mode 100644 Documentation/devicetree/bindings/net/dsa/qca8k.txt + create mode 100644 Documentation/devicetree/bindings/net/dsa/qca8k.yaml + +--- a/Documentation/devicetree/bindings/net/dsa/qca8k.txt ++++ /dev/null +@@ -1,245 +0,0 @@ +-* Qualcomm Atheros QCA8xxx switch family +- +-Required properties: +- +-- compatible: should be one of: +- "qca,qca8328": referenced as AR8328(N)-AK1(A/B) QFN 176 pin package +- "qca,qca8327": referenced as AR8327(N)-AL1A DR-QFN 148 pin package +- "qca,qca8334": referenced as QCA8334-AL3C QFN 88 pin package +- "qca,qca8337": referenced as QCA8337N-AL3(B/C) DR-QFN 148 pin package +- +-- #size-cells: must be 0 +-- #address-cells: must be 1 +- +-Optional properties: +- +-- reset-gpios: GPIO to be used to reset the whole device +-- qca,ignore-power-on-sel: Ignore power on pin strapping to configure led open +- drain or eeprom presence. This is needed for broken +- devices that have wrong configuration or when the oem +- decided to not use pin strapping and fallback to sw +- regs. +-- qca,led-open-drain: Set leds to open-drain mode. This requires the +- qca,ignore-power-on-sel to be set or the driver will fail +- to probe. This is needed if the oem doesn't use pin +- strapping to set this mode and prefers to set it using sw +- regs. The pin strapping related to led open drain mode is +- the pin B68 for QCA832x and B49 for QCA833x +- +-Subnodes: +- +-The integrated switch subnode should be specified according to the binding +-described in dsa/dsa.txt. If the QCA8K switch is connect to a SoC's external +-mdio-bus each subnode describing a port needs to have a valid phandle +-referencing the internal PHY it is connected to. This is because there's no +-N:N mapping of port and PHY id. +-To declare the internal mdio-bus configuration, declare a mdio node in the +-switch node and declare the phandle for the port referencing the internal +-PHY is connected to. In this config a internal mdio-bus is registered and +-the mdio MASTER is used as communication. +- +-Don't use mixed external and internal mdio-bus configurations, as this is +-not supported by the hardware. +- +-This switch support 2 CPU port. Normally and advised configuration is with +-CPU port set to port 0. It is also possible to set the CPU port to port 6 +-if the device requires it. The driver will configure the switch to the defined +-port. With both CPU port declared the first CPU port is selected as primary +-and the secondary CPU ignored. 
+- +-A CPU port node has the following optional node: +- +-- fixed-link : Fixed-link subnode describing a link to a non-MDIO +- managed entity. See +- Documentation/devicetree/bindings/net/fixed-link.txt +- for details. +-- qca,sgmii-rxclk-falling-edge: Set the receive clock phase to falling edge. +- Mostly used in qca8327 with CPU port 0 set to +- sgmii. +-- qca,sgmii-txclk-falling-edge: Set the transmit clock phase to falling edge. +-- qca,sgmii-enable-pll : For SGMII CPU port, explicitly enable PLL, TX and RX +- chain along with Signal Detection. +- This should NOT be enabled for qca8327. If enabled with +- qca8327 the sgmii port won't correctly init and an err +- is printed. +- This can be required for qca8337 switch with revision 2. +- A warning is displayed when used with revision greater +- 2. +- With CPU port set to sgmii and qca8337 it is advised +- to set this unless a communication problem is observed. +- +-For QCA8K the 'fixed-link' sub-node supports only the following properties: +- +-- 'speed' (integer, mandatory), to indicate the link speed. Accepted +- values are 10, 100 and 1000 +-- 'full-duplex' (boolean, optional), to indicate that full duplex is +- used. When absent, half duplex is assumed. +- +-Examples: +- +-for the external mdio-bus configuration: +- +- &mdio0 { +- phy_port1: phy@0 { +- reg = <0>; +- }; +- +- phy_port2: phy@1 { +- reg = <1>; +- }; +- +- phy_port3: phy@2 { +- reg = <2>; +- }; +- +- phy_port4: phy@3 { +- reg = <3>; +- }; +- +- phy_port5: phy@4 { +- reg = <4>; +- }; +- +- switch@10 { +- compatible = "qca,qca8337"; +- #address-cells = <1>; +- #size-cells = <0>; +- +- reset-gpios = <&gpio 42 GPIO_ACTIVE_LOW>; +- reg = <0x10>; +- +- ports { +- #address-cells = <1>; +- #size-cells = <0>; +- port@0 { +- reg = <0>; +- label = "cpu"; +- ethernet = <&gmac1>; +- phy-mode = "rgmii"; +- fixed-link { +- speed = 1000; +- full-duplex; +- }; +- }; +- +- port@1 { +- reg = <1>; +- label = "lan1"; +- phy-handle = <&phy_port1>; +- }; +- +- port@2 { +- reg = <2>; +- label = "lan2"; +- phy-handle = <&phy_port2>; +- }; +- +- port@3 { +- reg = <3>; +- label = "lan3"; +- phy-handle = <&phy_port3>; +- }; +- +- port@4 { +- reg = <4>; +- label = "lan4"; +- phy-handle = <&phy_port4>; +- }; +- +- port@5 { +- reg = <5>; +- label = "wan"; +- phy-handle = <&phy_port5>; +- }; +- }; +- }; +- }; +- +-for the internal master mdio-bus configuration: +- +- &mdio0 { +- switch@10 { +- compatible = "qca,qca8337"; +- #address-cells = <1>; +- #size-cells = <0>; +- +- reset-gpios = <&gpio 42 GPIO_ACTIVE_LOW>; +- reg = <0x10>; +- +- ports { +- #address-cells = <1>; +- #size-cells = <0>; +- +- port@0 { +- reg = <0>; +- label = "cpu"; +- ethernet = <&gmac1>; +- phy-mode = "rgmii"; +- fixed-link { +- speed = 1000; +- full-duplex; +- }; +- }; +- +- port@1 { +- reg = <1>; +- label = "lan1"; +- phy-mode = "internal"; +- phy-handle = <&phy_port1>; +- }; +- +- port@2 { +- reg = <2>; +- label = "lan2"; +- phy-mode = "internal"; +- phy-handle = <&phy_port2>; +- }; +- +- port@3 { +- reg = <3>; +- label = "lan3"; +- phy-mode = "internal"; +- phy-handle = <&phy_port3>; +- }; +- +- port@4 { +- reg = <4>; +- label = "lan4"; +- phy-mode = "internal"; +- phy-handle = <&phy_port4>; +- }; +- +- port@5 { +- reg = <5>; +- label = "wan"; +- phy-mode = "internal"; +- phy-handle = <&phy_port5>; +- }; +- }; +- +- mdio { +- #address-cells = <1>; +- #size-cells = <0>; +- +- phy_port1: phy@0 { +- reg = <0>; +- }; +- +- phy_port2: phy@1 { +- reg = <1>; +- }; +- +- phy_port3: phy@2 { +- reg = <2>; +- }; +- +- phy_port4: phy@3 
{ +- reg = <3>; +- }; +- +- phy_port5: phy@4 { +- reg = <4>; +- }; +- }; +- }; +- }; +--- /dev/null ++++ b/Documentation/devicetree/bindings/net/dsa/qca8k.yaml +@@ -0,0 +1,362 @@ ++# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) ++%YAML 1.2 ++--- ++$id: http://devicetree.org/schemas/net/dsa/qca8k.yaml# ++$schema: http://devicetree.org/meta-schemas/core.yaml# ++ ++title: Qualcomm Atheros QCA83xx switch family ++ ++maintainers: ++ - John Crispin ++ ++description: ++ If the QCA8K switch is connect to an SoC's external mdio-bus, each subnode ++ describing a port needs to have a valid phandle referencing the internal PHY ++ it is connected to. This is because there is no N:N mapping of port and PHY ++ ID. To declare the internal mdio-bus configuration, declare an MDIO node in ++ the switch node and declare the phandle for the port, referencing the internal ++ PHY it is connected to. In this config, an internal mdio-bus is registered and ++ the MDIO master is used for communication. Mixed external and internal ++ mdio-bus configurations are not supported by the hardware. ++ ++properties: ++ compatible: ++ oneOf: ++ - enum: ++ - qca,qca8327 ++ - qca,qca8328 ++ - qca,qca8334 ++ - qca,qca8337 ++ description: | ++ qca,qca8328: referenced as AR8328(N)-AK1(A/B) QFN 176 pin package ++ qca,qca8327: referenced as AR8327(N)-AL1A DR-QFN 148 pin package ++ qca,qca8334: referenced as QCA8334-AL3C QFN 88 pin package ++ qca,qca8337: referenced as QCA8337N-AL3(B/C) DR-QFN 148 pin package ++ ++ reg: ++ maxItems: 1 ++ ++ reset-gpios: ++ description: ++ GPIO to be used to reset the whole device ++ maxItems: 1 ++ ++ qca,ignore-power-on-sel: ++ $ref: /schemas/types.yaml#/definitions/flag ++ description: ++ Ignore power-on pin strapping to configure LED open-drain or EEPROM ++ presence. This is needed for devices with incorrect configuration or when ++ the OEM has decided not to use pin strapping and falls back to SW regs. ++ ++ qca,led-open-drain: ++ $ref: /schemas/types.yaml#/definitions/flag ++ description: ++ Set LEDs to open-drain mode. This requires the qca,ignore-power-on-sel to ++ be set, otherwise the driver will fail at probe. This is required if the ++ OEM does not use pin strapping to set this mode and prefers to set it ++ using SW regs. The pin strappings related to LED open-drain mode are ++ B68 on the QCA832x and B49 on the QCA833x. ++ ++ mdio: ++ type: object ++ description: Qca8k switch have an internal mdio to access switch port. ++ If this is not present, the legacy mapping is used and the ++ internal mdio access is used. ++ With the legacy mapping the reg corresponding to the internal ++ mdio is the switch reg with an offset of -1. ++ ++ properties: ++ '#address-cells': ++ const: 1 ++ '#size-cells': ++ const: 0 ++ ++ patternProperties: ++ "^(ethernet-)?phy@[0-4]$": ++ type: object ++ ++ allOf: ++ - $ref: "http://devicetree.org/schemas/net/mdio.yaml#" ++ ++ properties: ++ reg: ++ maxItems: 1 ++ ++ required: ++ - reg ++ ++patternProperties: ++ "^(ethernet-)?ports$": ++ type: object ++ properties: ++ '#address-cells': ++ const: 1 ++ '#size-cells': ++ const: 0 ++ ++ patternProperties: ++ "^(ethernet-)?port@[0-6]$": ++ type: object ++ description: Ethernet switch ports ++ ++ properties: ++ reg: ++ description: Port number ++ ++ label: ++ description: ++ Describes the label associated with this port, which will become ++ the netdev name ++ $ref: /schemas/types.yaml#/definitions/string ++ ++ link: ++ description: ++ Should be a list of phandles to other switch's DSA port. 
This ++ port is used as the outgoing port towards the phandle ports. The ++ full routing information must be given, not just the one hop ++ routes to neighbouring switches ++ $ref: /schemas/types.yaml#/definitions/phandle-array ++ ++ ethernet: ++ description: ++ Should be a phandle to a valid Ethernet device node. This host ++ device is what the switch port is connected to ++ $ref: /schemas/types.yaml#/definitions/phandle ++ ++ phy-handle: true ++ ++ phy-mode: true ++ ++ fixed-link: true ++ ++ mac-address: true ++ ++ sfp: true ++ ++ qca,sgmii-rxclk-falling-edge: ++ $ref: /schemas/types.yaml#/definitions/flag ++ description: ++ Set the receive clock phase to falling edge. Mostly commonly used on ++ the QCA8327 with CPU port 0 set to SGMII. ++ ++ qca,sgmii-txclk-falling-edge: ++ $ref: /schemas/types.yaml#/definitions/flag ++ description: ++ Set the transmit clock phase to falling edge. ++ ++ qca,sgmii-enable-pll: ++ $ref: /schemas/types.yaml#/definitions/flag ++ description: ++ For SGMII CPU port, explicitly enable PLL, TX and RX chain along with ++ Signal Detection. On the QCA8327 this should not be enabled, otherwise ++ the SGMII port will not initialize. When used on the QCA8337, revision 3 ++ or greater, a warning will be displayed. When the CPU port is set to ++ SGMII on the QCA8337, it is advised to set this unless a communication ++ issue is observed. ++ ++ required: ++ - reg ++ ++ additionalProperties: false ++ ++oneOf: ++ - required: ++ - ports ++ - required: ++ - ethernet-ports ++ ++required: ++ - compatible ++ - reg ++ ++additionalProperties: true ++ ++examples: ++ - | ++ #include ++ ++ mdio { ++ #address-cells = <1>; ++ #size-cells = <0>; ++ ++ external_phy_port1: ethernet-phy@0 { ++ reg = <0>; ++ }; ++ ++ external_phy_port2: ethernet-phy@1 { ++ reg = <1>; ++ }; ++ ++ external_phy_port3: ethernet-phy@2 { ++ reg = <2>; ++ }; ++ ++ external_phy_port4: ethernet-phy@3 { ++ reg = <3>; ++ }; ++ ++ external_phy_port5: ethernet-phy@4 { ++ reg = <4>; ++ }; ++ ++ switch@10 { ++ compatible = "qca,qca8337"; ++ #address-cells = <1>; ++ #size-cells = <0>; ++ reset-gpios = <&gpio 42 GPIO_ACTIVE_LOW>; ++ reg = <0x10>; ++ ++ ports { ++ #address-cells = <1>; ++ #size-cells = <0>; ++ ++ port@0 { ++ reg = <0>; ++ label = "cpu"; ++ ethernet = <&gmac1>; ++ phy-mode = "rgmii"; ++ ++ fixed-link { ++ speed = <1000>; ++ full-duplex; ++ }; ++ }; ++ ++ port@1 { ++ reg = <1>; ++ label = "lan1"; ++ phy-handle = <&external_phy_port1>; ++ }; ++ ++ port@2 { ++ reg = <2>; ++ label = "lan2"; ++ phy-handle = <&external_phy_port2>; ++ }; ++ ++ port@3 { ++ reg = <3>; ++ label = "lan3"; ++ phy-handle = <&external_phy_port3>; ++ }; ++ ++ port@4 { ++ reg = <4>; ++ label = "lan4"; ++ phy-handle = <&external_phy_port4>; ++ }; ++ ++ port@5 { ++ reg = <5>; ++ label = "wan"; ++ phy-handle = <&external_phy_port5>; ++ }; ++ }; ++ }; ++ }; ++ - | ++ #include ++ ++ mdio { ++ #address-cells = <1>; ++ #size-cells = <0>; ++ ++ switch@10 { ++ compatible = "qca,qca8337"; ++ #address-cells = <1>; ++ #size-cells = <0>; ++ reset-gpios = <&gpio 42 GPIO_ACTIVE_LOW>; ++ reg = <0x10>; ++ ++ ports { ++ #address-cells = <1>; ++ #size-cells = <0>; ++ ++ port@0 { ++ reg = <0>; ++ label = "cpu"; ++ ethernet = <&gmac1>; ++ phy-mode = "rgmii"; ++ ++ fixed-link { ++ speed = <1000>; ++ full-duplex; ++ }; ++ }; ++ ++ port@1 { ++ reg = <1>; ++ label = "lan1"; ++ phy-mode = "internal"; ++ phy-handle = <&internal_phy_port1>; ++ }; ++ ++ port@2 { ++ reg = <2>; ++ label = "lan2"; ++ phy-mode = "internal"; ++ phy-handle = <&internal_phy_port2>; ++ }; ++ ++ 
port@3 { ++ reg = <3>; ++ label = "lan3"; ++ phy-mode = "internal"; ++ phy-handle = <&internal_phy_port3>; ++ }; ++ ++ port@4 { ++ reg = <4>; ++ label = "lan4"; ++ phy-mode = "internal"; ++ phy-handle = <&internal_phy_port4>; ++ }; ++ ++ port@5 { ++ reg = <5>; ++ label = "wan"; ++ phy-mode = "internal"; ++ phy-handle = <&internal_phy_port5>; ++ }; ++ ++ port@6 { ++ reg = <0>; ++ label = "cpu"; ++ ethernet = <&gmac1>; ++ phy-mode = "sgmii"; ++ ++ qca,sgmii-rxclk-falling-edge; ++ ++ fixed-link { ++ speed = <1000>; ++ full-duplex; ++ }; ++ }; ++ }; ++ ++ mdio { ++ #address-cells = <1>; ++ #size-cells = <0>; ++ ++ internal_phy_port1: ethernet-phy@0 { ++ reg = <0>; ++ }; ++ ++ internal_phy_port2: ethernet-phy@1 { ++ reg = <1>; ++ }; ++ ++ internal_phy_port3: ethernet-phy@2 { ++ reg = <2>; ++ }; ++ ++ internal_phy_port4: ethernet-phy@3 { ++ reg = <3>; ++ }; ++ ++ internal_phy_port5: ethernet-phy@4 { ++ reg = <4>; ++ }; ++ }; ++ }; ++ }; diff --git a/root/target/linux/generic/backport-5.15/748-v5.16-net-dsa-qca8k-fix-delay-applied-to-wrong-cpu-in-parse-p.patch b/root/target/linux/generic/backport-5.15/748-v5.16-net-dsa-qca8k-fix-delay-applied-to-wrong-cpu-in-parse-p.patch new file mode 100755 index 00000000..a510cfdc --- /dev/null +++ b/root/target/linux/generic/backport-5.15/748-v5.16-net-dsa-qca8k-fix-delay-applied-to-wrong-cpu-in-parse-p.patch @@ -0,0 +1,28 @@ +From 06dd34a628ae5b6a839b757e746de165d6789ca8 Mon Sep 17 00:00:00 2001 +From: Ansuel Smith +Date: Sun, 17 Oct 2021 16:56:46 +0200 +Subject: net: dsa: qca8k: fix delay applied to wrong cpu in parse_port_config + +Fix delay settings applied to wrong cpu in parse_port_config. The delay +values is set to the wrong index as the cpu_port_index is incremented +too early. Start the cpu_port_index to -1 so the correct value is +applied to address also the case with invalid phy mode and not available +port. + +Signed-off-by: Ansuel Smith +Signed-off-by: David S. Miller +--- + drivers/net/dsa/qca8k.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/net/dsa/qca8k.c ++++ b/drivers/net/dsa/qca8k.c +@@ -976,7 +976,7 @@ qca8k_setup_of_pws_reg(struct qca8k_priv + static int + qca8k_parse_port_config(struct qca8k_priv *priv) + { +- int port, cpu_port_index = 0, ret; ++ int port, cpu_port_index = -1, ret; + struct device_node *port_dn; + phy_interface_t mode; + struct dsa_port *dp; diff --git a/root/target/linux/generic/backport-5.15/749-v5.16-net-dsa-qca8k-tidy-for-loop-in-setup-and-add-cpu-port-c.patch b/root/target/linux/generic/backport-5.15/749-v5.16-net-dsa-qca8k-tidy-for-loop-in-setup-and-add-cpu-port-c.patch new file mode 100755 index 00000000..71fa3022 --- /dev/null +++ b/root/target/linux/generic/backport-5.15/749-v5.16-net-dsa-qca8k-tidy-for-loop-in-setup-and-add-cpu-port-c.patch @@ -0,0 +1,151 @@ +From 040e926f5813a5f4cc18dbff7c942d1e52f368f2 Mon Sep 17 00:00:00 2001 +From: Ansuel Smith +Date: Tue, 19 Oct 2021 02:08:50 +0200 +Subject: net: dsa: qca8k: tidy for loop in setup and add cpu port check + +Tidy and organize qca8k setup function from multiple for loop. +Change for loop in bridge leave/join to scan all port and skip cpu port. +No functional change intended. + +Signed-off-by: Ansuel Smith +Signed-off-by: David S. 
Miller +--- + drivers/net/dsa/qca8k.c | 74 +++++++++++++++++++++++++++++-------------------- + 1 file changed, 44 insertions(+), 30 deletions(-) + +--- a/drivers/net/dsa/qca8k.c ++++ b/drivers/net/dsa/qca8k.c +@@ -1122,28 +1122,34 @@ qca8k_setup(struct dsa_switch *ds) + if (ret) + dev_warn(priv->dev, "mib init failed"); + +- /* Enable QCA header mode on the cpu port */ +- ret = qca8k_write(priv, QCA8K_REG_PORT_HDR_CTRL(cpu_port), +- QCA8K_PORT_HDR_CTRL_ALL << QCA8K_PORT_HDR_CTRL_TX_S | +- QCA8K_PORT_HDR_CTRL_ALL << QCA8K_PORT_HDR_CTRL_RX_S); +- if (ret) { +- dev_err(priv->dev, "failed enabling QCA header mode"); +- return ret; +- } +- +- /* Disable forwarding by default on all ports */ ++ /* Initial setup of all ports */ + for (i = 0; i < QCA8K_NUM_PORTS; i++) { ++ /* Disable forwarding by default on all ports */ + ret = qca8k_rmw(priv, QCA8K_PORT_LOOKUP_CTRL(i), + QCA8K_PORT_LOOKUP_MEMBER, 0); + if (ret) + return ret; +- } + +- /* Disable MAC by default on all ports */ +- for (i = 1; i < QCA8K_NUM_PORTS; i++) +- qca8k_port_set_status(priv, i, 0); ++ /* Enable QCA header mode on all cpu ports */ ++ if (dsa_is_cpu_port(ds, i)) { ++ ret = qca8k_write(priv, QCA8K_REG_PORT_HDR_CTRL(i), ++ QCA8K_PORT_HDR_CTRL_ALL << QCA8K_PORT_HDR_CTRL_TX_S | ++ QCA8K_PORT_HDR_CTRL_ALL << QCA8K_PORT_HDR_CTRL_RX_S); ++ if (ret) { ++ dev_err(priv->dev, "failed enabling QCA header mode"); ++ return ret; ++ } ++ } ++ ++ /* Disable MAC by default on all user ports */ ++ if (dsa_is_user_port(ds, i)) ++ qca8k_port_set_status(priv, i, 0); ++ } + +- /* Forward all unknown frames to CPU port for Linux processing */ ++ /* Forward all unknown frames to CPU port for Linux processing ++ * Notice that in multi-cpu config only one port should be set ++ * for igmp, unknown, multicast and broadcast packet ++ */ + ret = qca8k_write(priv, QCA8K_REG_GLOBAL_FW_CTRL1, + BIT(cpu_port) << QCA8K_GLOBAL_FW_CTRL1_IGMP_DP_S | + BIT(cpu_port) << QCA8K_GLOBAL_FW_CTRL1_BC_DP_S | +@@ -1152,11 +1158,13 @@ qca8k_setup(struct dsa_switch *ds) + if (ret) + return ret; + +- /* Setup connection between CPU port & user ports */ ++ /* Setup connection between CPU port & user ports ++ * Configure specific switch configuration for ports ++ */ + for (i = 0; i < QCA8K_NUM_PORTS; i++) { + /* CPU port gets connected to all user ports of the switch */ + if (dsa_is_cpu_port(ds, i)) { +- ret = qca8k_rmw(priv, QCA8K_PORT_LOOKUP_CTRL(cpu_port), ++ ret = qca8k_rmw(priv, QCA8K_PORT_LOOKUP_CTRL(i), + QCA8K_PORT_LOOKUP_MEMBER, dsa_user_ports(ds)); + if (ret) + return ret; +@@ -1193,16 +1201,14 @@ qca8k_setup(struct dsa_switch *ds) + if (ret) + return ret; + } +- } + +- /* The port 5 of the qca8337 have some problem in flood condition. The +- * original legacy driver had some specific buffer and priority settings +- * for the different port suggested by the QCA switch team. Add this +- * missing settings to improve switch stability under load condition. +- * This problem is limited to qca8337 and other qca8k switch are not affected. +- */ +- if (priv->switch_id == QCA8K_ID_QCA8337) { +- for (i = 0; i < QCA8K_NUM_PORTS; i++) { ++ /* The port 5 of the qca8337 have some problem in flood condition. The ++ * original legacy driver had some specific buffer and priority settings ++ * for the different port suggested by the QCA switch team. Add this ++ * missing settings to improve switch stability under load condition. ++ * This problem is limited to qca8337 and other qca8k switch are not affected. 
++ */ ++ if (priv->switch_id == QCA8K_ID_QCA8337) { + switch (i) { + /* The 2 CPU port and port 5 requires some different + * priority than any other ports. +@@ -1238,6 +1244,12 @@ qca8k_setup(struct dsa_switch *ds) + QCA8K_PORT_HOL_CTRL1_WRED_EN, + mask); + } ++ ++ /* Set initial MTU for every port. ++ * We have only have a general MTU setting. So track ++ * every port and set the max across all port. ++ */ ++ priv->port_mtu[i] = ETH_FRAME_LEN + ETH_FCS_LEN; + } + + /* Special GLOBAL_FC_THRESH value are needed for ar8327 switch */ +@@ -1251,8 +1263,6 @@ qca8k_setup(struct dsa_switch *ds) + } + + /* Setup our port MTUs to match power on defaults */ +- for (i = 0; i < QCA8K_NUM_PORTS; i++) +- priv->port_mtu[i] = ETH_FRAME_LEN + ETH_FCS_LEN; + ret = qca8k_write(priv, QCA8K_MAX_FRAME_SIZE, ETH_FRAME_LEN + ETH_FCS_LEN); + if (ret) + dev_warn(priv->dev, "failed setting MTU settings"); +@@ -1728,7 +1738,9 @@ qca8k_port_bridge_join(struct dsa_switch + cpu_port = dsa_to_port(ds, port)->cpu_dp->index; + port_mask = BIT(cpu_port); + +- for (i = 1; i < QCA8K_NUM_PORTS; i++) { ++ for (i = 0; i < QCA8K_NUM_PORTS; i++) { ++ if (dsa_is_cpu_port(ds, i)) ++ continue; + if (dsa_to_port(ds, i)->bridge_dev != br) + continue; + /* Add this port to the portvlan mask of the other ports +@@ -1758,7 +1770,9 @@ qca8k_port_bridge_leave(struct dsa_switc + + cpu_port = dsa_to_port(ds, port)->cpu_dp->index; + +- for (i = 1; i < QCA8K_NUM_PORTS; i++) { ++ for (i = 0; i < QCA8K_NUM_PORTS; i++) { ++ if (dsa_is_cpu_port(ds, i)) ++ continue; + if (dsa_to_port(ds, i)->bridge_dev != br) + continue; + /* Remove this port to the portvlan mask of the other ports diff --git a/root/target/linux/generic/backport-5.15/750-v5.16-net-dsa-qca8k-make-sure-pad0-mac06-exchange-is-disabled.patch b/root/target/linux/generic/backport-5.15/750-v5.16-net-dsa-qca8k-make-sure-pad0-mac06-exchange-is-disabled.patch new file mode 100755 index 00000000..4a61703c --- /dev/null +++ b/root/target/linux/generic/backport-5.15/750-v5.16-net-dsa-qca8k-make-sure-pad0-mac06-exchange-is-disabled.patch @@ -0,0 +1,47 @@ +From 5f15d392dcb4aa250a63d6f2c5adfc26c0aedc78 Mon Sep 17 00:00:00 2001 +From: Ansuel Smith +Date: Tue, 2 Nov 2021 19:30:41 +0100 +Subject: net: dsa: qca8k: make sure PAD0 MAC06 exchange is disabled + +Some device set MAC06 exchange in the bootloader. This cause some +problem as we don't support this strange mode and we just set the port6 +as the primary CPU port. With MAC06 exchange, PAD0 reg configure port6 +instead of port0. Add an extra check and explicitly disable MAC06 exchange +to correctly configure the port PAD config. + +Signed-off-by: Ansuel Smith +Fixes: 3fcf734aa482 ("net: dsa: qca8k: add support for cpu port 6") +Reviewed-by: Vladimir Oltean +Signed-off-by: David S. 
Miller +--- + drivers/net/dsa/qca8k.c | 8 ++++++++ + drivers/net/dsa/qca8k.h | 1 + + 2 files changed, 9 insertions(+) + +--- a/drivers/net/dsa/qca8k.c ++++ b/drivers/net/dsa/qca8k.c +@@ -1109,6 +1109,14 @@ qca8k_setup(struct dsa_switch *ds) + if (ret) + return ret; + ++ /* Make sure MAC06 is disabled */ ++ ret = qca8k_reg_clear(priv, QCA8K_REG_PORT0_PAD_CTRL, ++ QCA8K_PORT0_PAD_MAC06_EXCHANGE_EN); ++ if (ret) { ++ dev_err(priv->dev, "failed disabling MAC06 exchange"); ++ return ret; ++ } ++ + /* Enable CPU Port */ + ret = qca8k_reg_set(priv, QCA8K_REG_GLOBAL_FW_CTRL0, + QCA8K_GLOBAL_FW_CTRL0_CPU_PORT_EN); +--- a/drivers/net/dsa/qca8k.h ++++ b/drivers/net/dsa/qca8k.h +@@ -34,6 +34,7 @@ + #define QCA8K_MASK_CTRL_DEVICE_ID_MASK GENMASK(15, 8) + #define QCA8K_MASK_CTRL_DEVICE_ID(x) ((x) >> 8) + #define QCA8K_REG_PORT0_PAD_CTRL 0x004 ++#define QCA8K_PORT0_PAD_MAC06_EXCHANGE_EN BIT(31) + #define QCA8K_PORT0_PAD_SGMII_RXCLK_FALLING_EDGE BIT(19) + #define QCA8K_PORT0_PAD_SGMII_TXCLK_FALLING_EDGE BIT(18) + #define QCA8K_REG_PORT5_PAD_CTRL 0x008 diff --git a/root/target/linux/generic/hack-5.15/204-module_strip.patch b/root/target/linux/generic/hack-5.15/204-module_strip.patch index 0968d6a1..9b25707f 100755 --- a/root/target/linux/generic/hack-5.15/204-module_strip.patch +++ b/root/target/linux/generic/hack-5.15/204-module_strip.patch @@ -88,7 +88,7 @@ Signed-off-by: Felix Fietkau --- a/init/Kconfig +++ b/init/Kconfig -@@ -2347,6 +2347,13 @@ config UNUSED_KSYMS_WHITELIST +@@ -2324,6 +2324,13 @@ config UNUSED_KSYMS_WHITELIST one per line. The path can be absolute, or relative to the kernel source tree. @@ -104,23 +104,7 @@ Signed-off-by: Felix Fietkau config MODULES_TREE_LOOKUP --- a/kernel/module.c +++ b/kernel/module.c -@@ -1218,6 +1218,7 @@ static struct module_attribute *modinfo_ - - static const char vermagic[] = VERMAGIC_STRING; - -+#if defined(CONFIG_MODVERSIONS) || !defined(CONFIG_MODULE_STRIPPED) - static int try_to_force_load(struct module *mod, const char *reason) - { - #ifdef CONFIG_MODULE_FORCE_LOAD -@@ -1229,6 +1230,7 @@ static int try_to_force_load(struct modu - return -ENOEXEC; - #endif - } -+#endif - - #ifdef CONFIG_MODVERSIONS - -@@ -3227,9 +3229,11 @@ static int setup_load_info(struct load_i +@@ -3227,9 +3227,11 @@ static int setup_load_info(struct load_i static int check_modinfo(struct module *mod, struct load_info *info, int flags) { @@ -133,7 +117,7 @@ Signed-off-by: Felix Fietkau if (flags & MODULE_INIT_IGNORE_VERMAGIC) modmagic = NULL; -@@ -3250,6 +3254,7 @@ static int check_modinfo(struct module * +@@ -3250,6 +3252,7 @@ static int check_modinfo(struct module * mod->name); add_taint_module(mod, TAINT_OOT_MODULE, LOCKDEP_STILL_OK); } @@ -143,7 +127,7 @@ Signed-off-by: Felix Fietkau --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c -@@ -2033,7 +2033,9 @@ static void read_symbols(const char *mod +@@ -2024,7 +2024,9 @@ static void read_symbols(const char *mod symname = remove_dot(info.strtab + sym->st_name); handle_symbol(mod, &info, sym, symname); @@ -153,7 +137,7 @@ Signed-off-by: Felix Fietkau } for (sym = info.symtab_start; sym < info.symtab_stop; sym++) { -@@ -2212,8 +2214,10 @@ static void add_header(struct buffer *b, +@@ -2203,8 +2205,10 @@ static void add_header(struct buffer *b, buf_printf(b, "BUILD_SALT;\n"); buf_printf(b, "BUILD_LTO_INFO;\n"); buf_printf(b, "\n"); @@ -164,7 +148,7 @@ Signed-off-by: Felix Fietkau buf_printf(b, "\n"); buf_printf(b, "__visible struct module __this_module\n"); buf_printf(b, "__section(\".gnu.linkonce.this_module\") = {\n"); -@@ 
-2230,8 +2234,10 @@ static void add_header(struct buffer *b, +@@ -2221,8 +2225,10 @@ static void add_header(struct buffer *b, static void add_intree_flag(struct buffer *b, int is_intree) { @@ -175,7 +159,7 @@ Signed-off-by: Felix Fietkau } /* Cannot check for assembler */ -@@ -2244,8 +2250,10 @@ static void add_retpoline(struct buffer +@@ -2235,8 +2241,10 @@ static void add_retpoline(struct buffer static void add_staging_flag(struct buffer *b, const char *name) { @@ -186,7 +170,7 @@ Signed-off-by: Felix Fietkau } /** -@@ -2325,11 +2333,13 @@ static void add_depends(struct buffer *b +@@ -2316,11 +2324,13 @@ static void add_depends(struct buffer *b static void add_srcversion(struct buffer *b, struct module *mod) { @@ -200,7 +184,7 @@ Signed-off-by: Felix Fietkau } static void write_buf(struct buffer *b, const char *fname) -@@ -2578,7 +2588,9 @@ int main(int argc, char **argv) +@@ -2569,7 +2579,9 @@ int main(int argc, char **argv) add_staging_flag(&buf, mod->name); add_versions(&buf, mod); add_depends(&buf, mod); diff --git a/root/target/linux/generic/hack-5.15/205-kconfig-exit.patch b/root/target/linux/generic/hack-5.15/205-kconfig-exit.patch deleted file mode 100755 index e61c3ffd..00000000 --- a/root/target/linux/generic/hack-5.15/205-kconfig-exit.patch +++ /dev/null @@ -1,11 +0,0 @@ ---- a/scripts/kconfig/conf.c -+++ b/scripts/kconfig/conf.c -@@ -435,6 +435,8 @@ static int conf_sym(struct menu *menu) - break; - continue; - case 0: -+ if (!sym_has_value(sym) && !tty_stdio && getenv("FAIL_ON_UNCONFIGURED")) -+ exit(1); - newval = oldval; - break; - case '?': diff --git a/root/target/linux/generic/hack-5.15/210-darwin_scripts_include.patch b/root/target/linux/generic/hack-5.15/210-darwin_scripts_include.patch index be6adc0d..d68e2f88 100755 --- a/root/target/linux/generic/hack-5.15/210-darwin_scripts_include.patch +++ b/root/target/linux/generic/hack-5.15/210-darwin_scripts_include.patch @@ -3039,7 +3039,7 @@ Signed-off-by: Florian Fainelli main(int argc, char **argv) --- a/scripts/mod/modpost.h +++ b/scripts/mod/modpost.h -@@ -8,7 +8,11 @@ +@@ -9,7 +9,11 @@ #include #include #include diff --git a/root/target/linux/generic/hack-5.15/212-tools_portability.patch b/root/target/linux/generic/hack-5.15/212-tools_portability.patch index b488155f..ffbb7d14 100755 --- a/root/target/linux/generic/hack-5.15/212-tools_portability.patch +++ b/root/target/linux/generic/hack-5.15/212-tools_portability.patch @@ -68,10 +68,14 @@ Signed-off-by: Felix Fietkau +#endif --- a/tools/include/linux/types.h +++ b/tools/include/linux/types.h -@@ -10,8 +10,12 @@ - #define __SANE_USERSPACE_TYPES__ /* For PPC64, to get LL64 types */ - #endif +@@ -6,12 +6,13 @@ + #include + #include +-#ifndef __SANE_USERSPACE_TYPES__ + #define __SANE_USERSPACE_TYPES__ /* For PPC64, to get LL64 types */ +-#endif +- +#ifndef __linux__ +#include +#else diff --git a/root/target/linux/generic/hack-5.15/220-arm-gc_sections.patch b/root/target/linux/generic/hack-5.15/220-arm-gc_sections.patch deleted file mode 100755 index 305556be..00000000 --- a/root/target/linux/generic/hack-5.15/220-arm-gc_sections.patch +++ /dev/null @@ -1,122 +0,0 @@ -From e3d8676f5722b7622685581e06e8f53e6138e3ab Mon Sep 17 00:00:00 2001 -From: Felix Fietkau -Date: Sat, 15 Jul 2017 23:42:36 +0200 -Subject: use -ffunction-sections, -fdata-sections and --gc-sections - -In combination with kernel symbol export stripping this significantly reduces -the kernel image size. Used on both ARM and MIPS architectures. 
- -Signed-off-by: Felix Fietkau -Signed-off-by: Jonas Gorski -Signed-off-by: Gabor Juhos ---- ---- a/arch/arm/Kconfig -+++ b/arch/arm/Kconfig -@@ -117,6 +117,7 @@ config ARM - select HAVE_UID16 - select HAVE_VIRT_CPU_ACCOUNTING_GEN - select IRQ_FORCED_THREADING -+ select HAVE_LD_DEAD_CODE_DATA_ELIMINATION - select MODULES_USE_ELF_REL - select NEED_DMA_MAP_STATE - select OF_EARLY_FLATTREE if OF ---- a/arch/arm/boot/compressed/Makefile -+++ b/arch/arm/boot/compressed/Makefile -@@ -92,6 +92,7 @@ endif - ifeq ($(CONFIG_USE_OF),y) - OBJS += $(libfdt_objs) fdt_check_mem_start.o - endif -+KBUILD_CFLAGS_KERNEL := $(patsubst -f%-sections,,$(KBUILD_CFLAGS_KERNEL)) - - # -fstack-protector-strong triggers protection checks in this code, - # but it is being used too early to link to meaningful stack_chk logic. ---- a/arch/arm/kernel/vmlinux.lds.S -+++ b/arch/arm/kernel/vmlinux.lds.S -@@ -75,7 +75,7 @@ SECTIONS - . = ALIGN(4); - __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { - __start___ex_table = .; -- ARM_MMU_KEEP(*(__ex_table)) -+ KEEP(*(__ex_table)) - __stop___ex_table = .; - } - -@@ -100,24 +100,24 @@ SECTIONS - } - .init.arch.info : { - __arch_info_begin = .; -- *(.arch.info.init) -+ KEEP(*(.arch.info.init)) - __arch_info_end = .; - } - .init.tagtable : { - __tagtable_begin = .; -- *(.taglist.init) -+ KEEP(*(.taglist.init)) - __tagtable_end = .; - } - #ifdef CONFIG_SMP_ON_UP - .init.smpalt : { - __smpalt_begin = .; -- *(.alt.smp.init) -+ KEEP(*(.alt.smp.init)) - __smpalt_end = .; - } - #endif - .init.pv_table : { - __pv_table_begin = .; -- *(.pv_table) -+ KEEP(*(.pv_table)) - __pv_table_end = .; - } - ---- a/arch/arm/include/asm/vmlinux.lds.h -+++ b/arch/arm/include/asm/vmlinux.lds.h -@@ -29,13 +29,13 @@ - #define PROC_INFO \ - . = ALIGN(4); \ - __proc_info_begin = .; \ -- *(.proc.info.init) \ -+ KEEP(*(.proc.info.init)) \ - __proc_info_end = .; - - #define IDMAP_TEXT \ - ALIGN_FUNCTION(); \ - __idmap_text_start = .; \ -- *(.idmap.text) \ -+ KEEP(*(.idmap.text)) \ - __idmap_text_end = .; \ - - #define ARM_DISCARD \ -@@ -96,12 +96,12 @@ - . = ALIGN(8); \ - .ARM.unwind_idx : { \ - __start_unwind_idx = .; \ -- *(.ARM.exidx*) \ -+ KEEP(*(.ARM.exidx*)) \ - __stop_unwind_idx = .; \ - } \ - .ARM.unwind_tab : { \ - __start_unwind_tab = .; \ -- *(.ARM.extab*) \ -+ KEEP(*(.ARM.extab*)) \ - __stop_unwind_tab = .; \ - } - -@@ -112,14 +112,14 @@ - #define ARM_VECTORS \ - __vectors_start = .; \ - .vectors 0xffff0000 : AT(__vectors_start) { \ -- *(.vectors) \ -+ KEEP(*(.vectors)) \ - } \ - . = __vectors_start + SIZEOF(.vectors); \ - __vectors_end = .; \ - \ - __stubs_start = .; \ - .stubs ADDR(.vectors) + 0x1000 : AT(__stubs_start) { \ -- *(.stubs) \ -+ KEEP(*(.stubs)) \ - } \ - . = __stubs_start + SIZEOF(.stubs); \ - __stubs_end = .; \ diff --git a/root/target/linux/generic/hack-5.15/221-module_exports.patch b/root/target/linux/generic/hack-5.15/221-module_exports.patch index 65cee1a5..0153d3a5 100755 --- a/root/target/linux/generic/hack-5.15/221-module_exports.patch +++ b/root/target/linux/generic/hack-5.15/221-module_exports.patch @@ -30,7 +30,7 @@ Signed-off-by: Felix Fietkau /* Align . to a 8 byte boundary equals to maximum function alignment. */ #define ALIGN_FUNCTION() . 
= ALIGN(8) -@@ -484,14 +494,14 @@ +@@ -486,14 +496,14 @@ /* Kernel symbol table: Normal symbols */ \ __ksymtab : AT(ADDR(__ksymtab) - LOAD_OFFSET) { \ __start___ksymtab = .; \ @@ -47,7 +47,7 @@ Signed-off-by: Felix Fietkau __stop___ksymtab_gpl = .; \ } \ \ -@@ -511,7 +521,7 @@ +@@ -513,7 +523,7 @@ \ /* Kernel symbol table: strings */ \ __ksymtab_strings : AT(ADDR(__ksymtab_strings) - LOAD_OFFSET) { \ @@ -56,7 +56,7 @@ Signed-off-by: Felix Fietkau } \ \ /* __*init sections */ \ -@@ -1018,6 +1028,8 @@ +@@ -1009,6 +1019,8 @@ #define COMMON_DISCARDS \ SANITIZER_DISCARDS \ @@ -91,7 +91,7 @@ Signed-off-by: Felix Fietkau "__kstrtabns_" #sym ": \n" \ --- a/scripts/Makefile.build +++ b/scripts/Makefile.build -@@ -385,7 +385,7 @@ targets += $(real-dtb-y) $(lib-y) $(alwa +@@ -358,7 +358,7 @@ targets += $(real-dtb-y) $(lib-y) $(alwa # Linker scripts preprocessor (.lds.S -> .lds) # --------------------------------------------------------------------------- quiet_cmd_cpp_lds_S = LDS $@ diff --git a/root/target/linux/generic/hack-5.15/230-openwrt_lzma_options.patch b/root/target/linux/generic/hack-5.15/230-openwrt_lzma_options.patch index 8aa5b7c5..6bc5d1de 100755 --- a/root/target/linux/generic/hack-5.15/230-openwrt_lzma_options.patch +++ b/root/target/linux/generic/hack-5.15/230-openwrt_lzma_options.patch @@ -23,7 +23,7 @@ Signed-off-by: Imre Kaloz { {0x02, 0x21}, "lz4", unlz4 }, --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib -@@ -413,7 +413,7 @@ quiet_cmd_bzip2 = BZIP2 $@ +@@ -408,7 +408,7 @@ quiet_cmd_bzip2 = BZIP2 $@ # --------------------------------------------------------------------------- quiet_cmd_lzma = LZMA $@ diff --git a/root/target/linux/generic/hack-5.15/251-kconfig.patch b/root/target/linux/generic/hack-5.15/251-kconfig.patch index ea830272..004f18c0 100755 --- a/root/target/linux/generic/hack-5.15/251-kconfig.patch +++ b/root/target/linux/generic/hack-5.15/251-kconfig.patch @@ -92,7 +92,7 @@ Signed-off-by: John Crispin bool --- a/lib/Kconfig +++ b/lib/Kconfig -@@ -439,16 +439,16 @@ config BCH_CONST_T +@@ -433,16 +433,16 @@ config BCH_CONST_T # Textsearch support is select'ed if needed # config TEXTSEARCH diff --git a/root/target/linux/generic/hack-5.15/252-SATA_PMP.patch b/root/target/linux/generic/hack-5.15/252-SATA_PMP.patch deleted file mode 100755 index 6502d1d6..00000000 --- a/root/target/linux/generic/hack-5.15/252-SATA_PMP.patch +++ /dev/null @@ -1,23 +0,0 @@ -From 8c817e33be829c7249c2cfd59ff48ad5fac6a31d Mon Sep 17 00:00:00 2001 -From: Sungbo Eo -Date: Fri, 7 Jul 2017 17:09:21 +0200 -Subject: [PATCH] kconfig: solidify SATA_PMP config - -SATA_PMP option in kernel config file disappears for every kernel_oldconfig refresh. -To prevent this, SATA_HOST is now selected automatically when SATA_PMP is enabled. -This patch can be dropped if SATA_MV is ever re-added into the config. 
---- - drivers/ata/Kconfig | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - ---- a/drivers/ata/Kconfig -+++ b/drivers/ata/Kconfig -@@ -112,7 +112,7 @@ config SATA_ZPODD - - config SATA_PMP - bool "SATA Port Multiplier support" -- depends on SATA_HOST -+ select SATA_HOST - default y - help - This option adds support for SATA Port Multipliers diff --git a/root/target/linux/generic/hack-5.15/259-regmap_dynamic.patch b/root/target/linux/generic/hack-5.15/259-regmap_dynamic.patch deleted file mode 100755 index d1d56a11..00000000 --- a/root/target/linux/generic/hack-5.15/259-regmap_dynamic.patch +++ /dev/null @@ -1,144 +0,0 @@ -From 811d9e2268a62b830cfe93cd8bc929afcb8b198b Mon Sep 17 00:00:00 2001 -From: Felix Fietkau -Date: Sat, 15 Jul 2017 21:12:38 +0200 -Subject: kernel: move regmap bloat out of the kernel image if it is only being used in modules - -lede-commit: 96f39119815028073583e4fca3a9c5fe9141e998 -Signed-off-by: Felix Fietkau ---- - drivers/base/regmap/Kconfig | 15 ++++++++++----- - drivers/base/regmap/Makefile | 12 ++++++++---- - drivers/base/regmap/regmap.c | 3 +++ - include/linux/regmap.h | 2 +- - 4 files changed, 22 insertions(+), 10 deletions(-) - ---- a/drivers/base/regmap/Kconfig -+++ b/drivers/base/regmap/Kconfig -@@ -4,10 +4,9 @@ - # subsystems should select the appropriate symbols. - - config REGMAP -- default y if (REGMAP_I2C || REGMAP_SPI || REGMAP_SPMI || REGMAP_W1 || REGMAP_AC97 || REGMAP_MMIO || REGMAP_IRQ || REGMAP_SOUNDWIRE || REGMAP_SOUNDWIRE_MBQ || REGMAP_SCCB || REGMAP_I3C || REGMAP_SPI_AVMM || REGMAP_MDIO) - select IRQ_DOMAIN if REGMAP_IRQ - select MDIO_BUS if REGMAP_MDIO -- bool -+ tristate - - config REGCACHE_COMPRESSED - select LZO_COMPRESS -@@ -15,53 +14,67 @@ config REGCACHE_COMPRESSED - bool - - config REGMAP_AC97 -+ select REGMAP - tristate - - config REGMAP_I2C -+ select REGMAP - tristate - depends on I2C - - config REGMAP_SLIMBUS -+ select REGMAP - tristate - depends on SLIMBUS - - config REGMAP_SPI -+ select REGMAP - tristate - depends on SPI - - config REGMAP_SPMI -+ select REGMAP - tristate - depends on SPMI - - config REGMAP_W1 -+ select REGMAP - tristate - depends on W1 - - config REGMAP_MDIO -+ select REGMAP - tristate - - config REGMAP_MMIO -+ select REGMAP - tristate - - config REGMAP_IRQ -+ select REGMAP - bool - - config REGMAP_SOUNDWIRE -+ select REGMAP - tristate - depends on SOUNDWIRE - - config REGMAP_SOUNDWIRE_MBQ -+ select REGMAP - tristate - depends on SOUNDWIRE - - config REGMAP_SCCB -+ select REGMAP - tristate - depends on I2C - - config REGMAP_I3C -+ select REGMAP - tristate - depends on I3C - - config REGMAP_SPI_AVMM -+ select REGMAP - tristate - depends on SPI ---- a/drivers/base/regmap/Makefile -+++ b/drivers/base/regmap/Makefile -@@ -2,10 +2,14 @@ - # For include/trace/define_trace.h to include trace.h - CFLAGS_regmap.o := -I$(src) - --obj-$(CONFIG_REGMAP) += regmap.o regcache.o --obj-$(CONFIG_REGMAP) += regcache-rbtree.o regcache-flat.o --obj-$(CONFIG_REGCACHE_COMPRESSED) += regcache-lzo.o --obj-$(CONFIG_DEBUG_FS) += regmap-debugfs.o -+regmap-core-objs = regmap.o regcache.o regcache-rbtree.o regcache-flat.o -+ifdef CONFIG_DEBUG_FS -+regmap-core-objs += regmap-debugfs.o -+endif -+ifdef CONFIG_REGCACHE_COMPRESSED -+regmap-core-objs += regcache-lzo.o -+endif -+obj-$(CONFIG_REGMAP) += regmap-core.o - obj-$(CONFIG_REGMAP_AC97) += regmap-ac97.o - obj-$(CONFIG_REGMAP_I2C) += regmap-i2c.o - obj-$(CONFIG_REGMAP_SLIMBUS) += regmap-slimbus.o ---- a/drivers/base/regmap/regmap.c -+++ b/drivers/base/regmap/regmap.c -@@ -9,6 +9,7 @@ - 
#include - #include - #include -+#include - #include - #include - #include -@@ -3339,3 +3340,5 @@ static int __init regmap_initcall(void) - return 0; - } - postcore_initcall(regmap_initcall); -+ -+MODULE_LICENSE("GPL"); ---- a/include/linux/regmap.h -+++ b/include/linux/regmap.h -@@ -180,7 +180,7 @@ struct reg_sequence { - __ret ?: __tmp; \ - }) - --#ifdef CONFIG_REGMAP -+#if IS_REACHABLE(CONFIG_REGMAP) - - enum regmap_endian { - /* Unspecified -> 0 -> Backwards compatible default */ diff --git a/root/target/linux/generic/hack-5.15/301-mips_image_cmdline_hack.patch b/root/target/linux/generic/hack-5.15/301-mips_image_cmdline_hack.patch index 15e233ac..993b1e6f 100755 --- a/root/target/linux/generic/hack-5.15/301-mips_image_cmdline_hack.patch +++ b/root/target/linux/generic/hack-5.15/301-mips_image_cmdline_hack.patch @@ -10,7 +10,7 @@ Signed-off-by: Gabor Juhos --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig -@@ -1180,6 +1180,10 @@ config MIPS_MSC +@@ -1202,6 +1202,10 @@ config MIPS_MSC config SYNC_R4K bool diff --git a/root/target/linux/generic/hack-5.15/401-mtd-super-don-t-reply-on-mtdblock-device-minor.patch b/root/target/linux/generic/hack-5.15/401-mtd-super-don-t-reply-on-mtdblock-device-minor.patch deleted file mode 100755 index 8f985c0b..00000000 --- a/root/target/linux/generic/hack-5.15/401-mtd-super-don-t-reply-on-mtdblock-device-minor.patch +++ /dev/null @@ -1,84 +0,0 @@ -From f9760b158f610b1792a222cc924073724c061bfb Mon Sep 17 00:00:00 2001 -From: Daniel Golle -Date: Wed, 7 Apr 2021 22:37:57 +0100 -Subject: [PATCH 1/2] mtd: super: don't reply on mtdblock device minor -To: linux-mtd@lists.infradead.org -Cc: Vignesh Raghavendra , - Richard Weinberger , - Miquel Raynal , - David Woodhouse - -For blktrans devices with partitions (ie. part_bits != 0) the -assumption that the minor number of the mtdblock device matches -the mtdnum doesn't hold true. -Properly resolve mtd device from blktrans layer instead. 
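
The description above hinges on one piece of arithmetic: once a blktrans driver advertises part_bits, a block minor is devnum << part_bits plus the partition index, so the old assumption "minor == mtd number" only holds for part_bits == 0. A minimal sketch of that relation, using the same fields the patch body below touches (the helper name is made up for illustration; this is not code from the hack itself):

#include <linux/blkdev.h>
#include <linux/mtd/blktrans.h>

/* Hypothetical helper, for illustration only: recover the mtd number from
 * an opened mtdblock block device the way the hack does it, instead of
 * trusting MINOR() directly. */
static int mtd_num_from_bdev(struct block_device *bdev)
{
        struct mtd_blktrans_dev *dev = bdev->bd_disk->private_data;
        int part_bits = dev->tr->part_bits;

        /* the whole-device minor is devnum << part_bits; anything else
         * is a partition of that device */
        if (MINOR(bdev->bd_dev) != (dev->devnum << part_bits))
                return -EINVAL;

        return dev->devnum;
}
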
- -Signed-off-by: Daniel Golle ---- - drivers/mtd/mtdsuper.c | 33 ++++++++++++++++++++++++++------- - 1 file changed, 26 insertions(+), 7 deletions(-) - ---- a/drivers/mtd/mtdsuper.c -+++ b/drivers/mtd/mtdsuper.c -@@ -9,6 +9,7 @@ - */ - - #include -+#include - #include - #include - #include -@@ -120,8 +121,9 @@ int get_tree_mtd(struct fs_context *fc, - struct fs_context *fc)) - { - #ifdef CONFIG_BLOCK -- dev_t dev; -- int ret; -+ struct mtd_blktrans_dev *blktrans_dev; -+ struct block_device *bdev; -+ int ret, part_bits; - #endif - int mtdnr; - -@@ -169,16 +171,36 @@ int get_tree_mtd(struct fs_context *fc, - /* try the old way - the hack where we allowed users to mount - * /dev/mtdblock$(n) but didn't actually _use_ the blockdev - */ -- ret = lookup_bdev(fc->source, &dev); -- if (ret) { -+ bdev = blkdev_get_by_path(fc->source, FMODE_READ, NULL); -+ if (IS_ERR(bdev)) { -+ ret = PTR_ERR(bdev); - errorf(fc, "MTD: Couldn't look up '%s': %d", fc->source, ret); - return ret; - } -- pr_debug("MTDSB: lookup_bdev() returned 0\n"); -+ pr_debug("MTDSB: blkdev_get_by_path() returned 0\n"); - -- if (MAJOR(dev) == MTD_BLOCK_MAJOR) -- return mtd_get_sb_by_nr(fc, MINOR(dev), fill_super); -+ if (MAJOR(bdev->bd_dev) == MTD_BLOCK_MAJOR) { -+ if (!bdev->bd_disk) { -+ blkdev_put(bdev, FMODE_READ); -+ BUG(); -+ return -EINVAL; -+ } - -+ blktrans_dev = (struct mtd_blktrans_dev *)(bdev->bd_disk->private_data); -+ if (!blktrans_dev || !blktrans_dev->tr) { -+ blkdev_put(bdev, FMODE_READ); -+ BUG(); -+ return -EINVAL; -+ } -+ mtdnr = blktrans_dev->devnum; -+ part_bits = blktrans_dev->tr->part_bits; -+ blkdev_put(bdev, FMODE_READ); -+ if (MINOR(bdev->bd_dev) != (mtdnr << part_bits)) -+ return -EINVAL; -+ -+ return mtd_get_sb_by_nr(fc, mtdnr, fill_super); -+ } -+ blkdev_put(bdev, FMODE_READ); - #endif /* CONFIG_BLOCK */ - - if (!(fc->sb_flags & SB_SILENT)) diff --git a/root/target/linux/generic/hack-5.15/402-mtd-blktrans-call-add-disks-after-mtd-device.patch b/root/target/linux/generic/hack-5.15/402-mtd-blktrans-call-add-disks-after-mtd-device.patch deleted file mode 100755 index c9821b57..00000000 --- a/root/target/linux/generic/hack-5.15/402-mtd-blktrans-call-add-disks-after-mtd-device.patch +++ /dev/null @@ -1,98 +0,0 @@ -From 0bccc3722bdd88e8ae995e77ef9f7b77ee4cbdee Mon Sep 17 00:00:00 2001 -From: Daniel Golle -Date: Wed, 7 Apr 2021 22:45:54 +0100 -Subject: [PATCH 2/2] mtd: blktrans: call add disks after mtd device -To: linux-mtd@lists.infradead.org -Cc: Vignesh Raghavendra , - Richard Weinberger , - Miquel Raynal , - David Woodhouse - -Calling device_add_disk while holding mtd_table_mutex leads -to deadlock in case part_bits!=0 as block partition parsers -will try to open the newly created disks, trying to acquire -mutex once again. -Move device_add_disk to additional function called after -add partitions of an MTD device have been added and locks -have been released. 
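
The 402 hack whose description ends here is the lock-ordering half of the same cleanup: device_add_disk() used to run while add_mtd_blktrans_dev() still held mtd_table_mutex, and with part_bits != 0 the partition scan re-opens the freshly created disk and tries to take that mutex again. A condensed sketch of the deferred publication step the hack adds (taken from the body below, with the sysfs attribute group and error handling dropped; it sits in drivers/mtd/mtd_blkdevs.c, where blktrans_majors is defined):

#include <linux/blkdev.h>
#include <linux/mtd/blktrans.h>

void register_mtd_blktrans_devs(void)
{
        struct mtd_blktrans_ops *tr;
        struct mtd_blktrans_dev *dev, *next;

        /* called from mtd_device_parse_register() after mtd_table_mutex
         * has been released, so partition scanning can safely re-enter
         * the mtd layer */
        list_for_each_entry(tr, &blktrans_majors, list) {
                list_for_each_entry_safe(dev, next, &tr->devs, list) {
                        if (disk_live(dev->disk))       /* already published */
                                continue;
                        device_add_disk(&dev->mtd->dev, dev->disk, NULL);
                }
        }
}
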
- -Signed-off-by: Daniel Golle ---- - drivers/mtd/mtd_blkdevs.c | 33 ++++++++++++++++++++++++++------- - drivers/mtd/mtdcore.c | 3 +++ - include/linux/mtd/blktrans.h | 1 + - 3 files changed, 30 insertions(+), 7 deletions(-) - ---- a/drivers/mtd/mtd_blkdevs.c -+++ b/drivers/mtd/mtd_blkdevs.c -@@ -384,13 +384,6 @@ int add_mtd_blktrans_dev(struct mtd_blkt - if (new->readonly) - set_disk_ro(gd, 1); - -- device_add_disk(&new->mtd->dev, gd, NULL); -- -- if (new->disk_attributes) { -- ret = sysfs_create_group(&disk_to_dev(gd)->kobj, -- new->disk_attributes); -- WARN_ON(ret); -- } - return 0; - - out_free_tag_set: -@@ -402,6 +395,27 @@ out_list_del: - return ret; - } - -+void register_mtd_blktrans_devs(void) -+{ -+ struct mtd_blktrans_ops *tr; -+ struct mtd_blktrans_dev *dev, *next; -+ int ret; -+ -+ list_for_each_entry(tr, &blktrans_majors, list) { -+ list_for_each_entry_safe(dev, next, &tr->devs, list) { -+ if (disk_live(dev->disk)) -+ continue; -+ -+ device_add_disk(&dev->mtd->dev, dev->disk, NULL); -+ if (dev->disk_attributes) { -+ ret = sysfs_create_group(&disk_to_dev(dev->disk)->kobj, -+ dev->disk_attributes); -+ WARN_ON(ret); -+ } -+ } -+ } -+} -+ - int del_mtd_blktrans_dev(struct mtd_blktrans_dev *old) - { - unsigned long flags; ---- a/drivers/mtd/mtdcore.c -+++ b/drivers/mtd/mtdcore.c -@@ -31,6 +31,7 @@ - - #include - #include -+#include - - #include "mtdcore.h" - -@@ -1000,6 +1001,8 @@ int mtd_device_parse_register(struct mtd - - ret = mtd_otp_nvmem_add(mtd); - -+ register_mtd_blktrans_devs(); -+ - out: - if (ret && device_is_registered(&mtd->dev)) - del_mtd_device(mtd); ---- a/include/linux/mtd/blktrans.h -+++ b/include/linux/mtd/blktrans.h -@@ -76,6 +76,7 @@ extern int deregister_mtd_blktrans(struc - extern int add_mtd_blktrans_dev(struct mtd_blktrans_dev *dev); - extern int del_mtd_blktrans_dev(struct mtd_blktrans_dev *dev); - extern int mtd_blktrans_cease_background(struct mtd_blktrans_dev *dev); -+extern void register_mtd_blktrans_devs(void); - - /** - * module_mtd_blktrans() - Helper macro for registering a mtd blktrans driver diff --git a/root/target/linux/generic/hack-5.15/410-block-fit-partition-parser.patch b/root/target/linux/generic/hack-5.15/410-block-fit-partition-parser.patch deleted file mode 100755 index 2ac6cb03..00000000 --- a/root/target/linux/generic/hack-5.15/410-block-fit-partition-parser.patch +++ /dev/null @@ -1,220 +0,0 @@ ---- a/block/blk.h -+++ b/block/blk.h -@@ -354,6 +354,7 @@ void blk_free_ext_minor(unsigned int min - #define ADDPART_FLAG_NONE 0 - #define ADDPART_FLAG_RAID 1 - #define ADDPART_FLAG_WHOLEDISK 2 -+#define ADDPART_FLAG_ROOTDEV 4 - int bdev_add_partition(struct gendisk *disk, int partno, sector_t start, - sector_t length); - int bdev_del_partition(struct gendisk *disk, int partno); ---- a/block/partitions/Kconfig -+++ b/block/partitions/Kconfig -@@ -101,6 +101,13 @@ config ATARI_PARTITION - Say Y here if you would like to use hard disks under Linux which - were partitioned under the Atari OS. - -+config FIT_PARTITION -+ bool "Flattened-Image-Tree (FIT) partition support" if PARTITION_ADVANCED -+ default n -+ help -+ Say Y here if your system needs to mount the filesystem part of -+ a Flattened-Image-Tree (FIT) image commonly used with Das U-Boot. 
-+ - config IBM_PARTITION - bool "IBM disk label and partition support" - depends on PARTITION_ADVANCED && S390 ---- a/block/partitions/Makefile -+++ b/block/partitions/Makefile -@@ -8,6 +8,7 @@ obj-$(CONFIG_ACORN_PARTITION) += acorn.o - obj-$(CONFIG_AMIGA_PARTITION) += amiga.o - obj-$(CONFIG_ATARI_PARTITION) += atari.o - obj-$(CONFIG_AIX_PARTITION) += aix.o -+obj-$(CONFIG_FIT_PARTITION) += fit.o - obj-$(CONFIG_CMDLINE_PARTITION) += cmdline.o - obj-$(CONFIG_MAC_PARTITION) += mac.o - obj-$(CONFIG_LDM_PARTITION) += ldm.o ---- a/block/partitions/check.h -+++ b/block/partitions/check.h -@@ -58,6 +58,7 @@ int amiga_partition(struct parsed_partit - int atari_partition(struct parsed_partitions *state); - int cmdline_partition(struct parsed_partitions *state); - int efi_partition(struct parsed_partitions *state); -+int fit_partition(struct parsed_partitions *state); - int ibm_partition(struct parsed_partitions *); - int karma_partition(struct parsed_partitions *state); - int ldm_partition(struct parsed_partitions *state); -@@ -68,3 +69,5 @@ int sgi_partition(struct parsed_partitio - int sun_partition(struct parsed_partitions *state); - int sysv68_partition(struct parsed_partitions *state); - int ultrix_partition(struct parsed_partitions *state); -+ -+int parse_fit_partitions(struct parsed_partitions *state, u64 start_sector, u64 nr_sectors, int *slot, int add_remain); ---- a/block/partitions/core.c -+++ b/block/partitions/core.c -@@ -11,6 +11,10 @@ - #include - #include - #include -+#ifdef CONFIG_FIT_PARTITION -+#include -+#endif -+ - #include "check.h" - - static int (*check_part[])(struct parsed_partitions *) = { -@@ -47,6 +51,9 @@ static int (*check_part[])(struct parsed - #ifdef CONFIG_EFI_PARTITION - efi_partition, /* this must come before msdos */ - #endif -+#ifdef CONFIG_FIT_PARTITION -+ fit_partition, -+#endif - #ifdef CONFIG_SGI_PARTITION - sgi_partition, - #endif -@@ -597,6 +604,11 @@ static bool blk_add_partition(struct gen - (state->parts[p].flags & ADDPART_FLAG_RAID)) - md_autodetect_dev(part->bd_dev); - -+#ifdef CONFIG_FIT_PARTITION -+ if ((state->parts[p].flags & ADDPART_FLAG_ROOTDEV) && ROOT_DEV == 0) -+ ROOT_DEV = part_to_dev(part)->devt; -+#endif -+ - return true; - } - ---- a/drivers/mtd/ubi/block.c -+++ b/drivers/mtd/ubi/block.c -@@ -419,7 +419,11 @@ int ubiblock_create(struct ubi_volume_in - - gd->fops = &ubiblock_ops; - gd->major = ubiblock_major; -+#ifdef CONFIG_FIT_PARTITION -+ gd->minors = 0; -+#else - gd->minors = 1; -+#endif - gd->first_minor = idr_alloc(&ubiblock_minor_idr, dev, 0, 0, GFP_KERNEL); - if (gd->first_minor < 0) { - dev_err(disk_to_dev(gd), -@@ -428,6 +432,9 @@ int ubiblock_create(struct ubi_volume_in - goto out_cleanup_disk; - } - gd->private_data = dev; -+#ifdef CONFIG_FIT_PARTITION -+ gd->flags |= GENHD_FL_EXT_DEVT; -+#endif - sprintf(gd->disk_name, "ubiblock%d_%d", dev->ubi_num, dev->vol_id); - set_capacity(gd, disk_capacity); - dev->gd = gd; ---- a/block/partitions/efi.c -+++ b/block/partitions/efi.c -@@ -716,6 +716,9 @@ int efi_partition(struct parsed_partitio - gpt_entry *ptes = NULL; - u32 i; - unsigned ssz = queue_logical_block_size(state->disk->queue) / 512; -+#ifdef CONFIG_FIT_PARTITION -+ u32 extra_slot = 64; -+#endif - - if (!find_valid_gpt(state, &gpt, &ptes) || !gpt || !ptes) { - kfree(gpt); -@@ -749,6 +752,11 @@ int efi_partition(struct parsed_partitio - ARRAY_SIZE(ptes[i].partition_name)); - utf16_le_to_7bit(ptes[i].partition_name, label_max, info->volname); - state->parts[i + 1].has_info = true; -+#ifdef CONFIG_FIT_PARTITION -+ /* If this 
is a U-Boot FIT volume it may have subpartitions */ -+ if (!efi_guidcmp(ptes[i].partition_type_guid, PARTITION_LINUX_FIT_GUID)) -+ (void) parse_fit_partitions(state, start * ssz, size * ssz, &extra_slot, 1); -+#endif - } - kfree(ptes); - kfree(gpt); ---- a/block/partitions/efi.h -+++ b/block/partitions/efi.h -@@ -52,6 +52,9 @@ - #define PARTITION_LINUX_LVM_GUID \ - EFI_GUID( 0xe6d6d379, 0xf507, 0x44c2, \ - 0xa2, 0x3c, 0x23, 0x8f, 0x2a, 0x3d, 0xf9, 0x28) -+#define PARTITION_LINUX_FIT_GUID \ -+ EFI_GUID( 0xcae9be83, 0xb15f, 0x49cc, \ -+ 0x86, 0x3f, 0x08, 0x1b, 0x74, 0x4a, 0x2d, 0x93) - - typedef struct _gpt_header { - __le64 signature; ---- a/drivers/mtd/mtdblock.c -+++ b/drivers/mtd/mtdblock.c -@@ -338,7 +338,11 @@ static void mtdblock_remove_dev(struct m - static struct mtd_blktrans_ops mtdblock_tr = { - .name = "mtdblock", - .major = MTD_BLOCK_MAJOR, -+#ifdef CONFIG_FIT_PARTITION -+ .part_bits = 1, -+#else - .part_bits = 0, -+#endif - .blksize = 512, - .open = mtdblock_open, - .flush = mtdblock_flush, ---- a/drivers/mtd/mtd_blkdevs.c -+++ b/drivers/mtd/mtd_blkdevs.c -@@ -346,18 +346,8 @@ int add_mtd_blktrans_dev(struct mtd_blkt - gd->minors = 1 << tr->part_bits; - gd->fops = &mtd_block_ops; - -- if (tr->part_bits) -- if (new->devnum < 26) -- snprintf(gd->disk_name, sizeof(gd->disk_name), -- "%s%c", tr->name, 'a' + new->devnum); -- else -- snprintf(gd->disk_name, sizeof(gd->disk_name), -- "%s%c%c", tr->name, -- 'a' - 1 + new->devnum / 26, -- 'a' + new->devnum % 26); -- else -- snprintf(gd->disk_name, sizeof(gd->disk_name), -- "%s%d", tr->name, new->devnum); -+ snprintf(gd->disk_name, sizeof(gd->disk_name), -+ "%s%d", tr->name, new->devnum); - - set_capacity(gd, ((u64)new->size * tr->blksize) >> 9); - ---- a/block/partitions/msdos.c -+++ b/block/partitions/msdos.c -@@ -564,6 +564,15 @@ static void parse_minix(struct parsed_pa - #endif /* CONFIG_MINIX_SUBPARTITION */ - } - -+static void parse_fit_mbr(struct parsed_partitions *state, -+ sector_t offset, sector_t size, int origin) -+{ -+#ifdef CONFIG_FIT_PARTITION -+ u32 extra_slot = 64; -+ (void) parse_fit_partitions(state, offset, size, &extra_slot, 1); -+#endif /* CONFIG_FIT_PARTITION */ -+} -+ - static struct { - unsigned char id; - void (*parse)(struct parsed_partitions *, sector_t, sector_t, int); -@@ -575,6 +584,7 @@ static struct { - {UNIXWARE_PARTITION, parse_unixware}, - {SOLARIS_X86_PARTITION, parse_solaris_x86}, - {NEW_SOLARIS_X86_PARTITION, parse_solaris_x86}, -+ {FIT_PARTITION, parse_fit_mbr}, - {0, NULL}, - }; - ---- a/include/linux/msdos_partition.h -+++ b/include/linux/msdos_partition.h -@@ -31,6 +31,7 @@ enum msdos_sys_ind { - LINUX_LVM_PARTITION = 0x8e, - LINUX_RAID_PARTITION = 0xfd, /* autodetect RAID partition */ - -+ FIT_PARTITION = 0x2e, /* U-Boot uImage.FIT */ - SOLARIS_X86_PARTITION = 0x82, /* also Linux swap partitions */ - NEW_SOLARIS_X86_PARTITION = 0xbf, - diff --git a/root/target/linux/generic/hack-5.15/420-mtd-set-rootfs-to-be-root-dev.patch b/root/target/linux/generic/hack-5.15/420-mtd-set-rootfs-to-be-root-dev.patch index aa1d4df0..91a91b36 100755 --- a/root/target/linux/generic/hack-5.15/420-mtd-set-rootfs-to-be-root-dev.patch +++ b/root/target/linux/generic/hack-5.15/420-mtd-set-rootfs-to-be-root-dev.patch @@ -20,7 +20,7 @@ Signed-off-by: Gabor Juhos #include #include -@@ -696,6 +697,19 @@ int add_mtd_device(struct mtd_info *mtd) +@@ -694,6 +695,19 @@ int add_mtd_device(struct mtd_info *mtd) of this try_ nonsense, and no bitching about it either. 
:) */ __module_get(THIS_MODULE); diff --git a/root/target/linux/generic/hack-5.15/640-bridge-only-accept-EAP-locally.patch b/root/target/linux/generic/hack-5.15/640-bridge-only-accept-EAP-locally.patch index 15c1e342..29a4f7f3 100755 --- a/root/target/linux/generic/hack-5.15/640-bridge-only-accept-EAP-locally.patch +++ b/root/target/linux/generic/hack-5.15/640-bridge-only-accept-EAP-locally.patch @@ -12,7 +12,7 @@ Signed-off-by: Etienne Champetier --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c -@@ -108,10 +108,14 @@ int br_handle_frame_finish(struct net *n +@@ -103,10 +103,14 @@ int br_handle_frame_finish(struct net *n } } @@ -30,7 +30,7 @@ Signed-off-by: Etienne Champetier if (IS_ENABLED(CONFIG_INET) && --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h -@@ -468,6 +468,8 @@ struct net_bridge { +@@ -402,6 +402,8 @@ struct net_bridge { u16 group_fwd_mask; u16 group_fwd_mask_required; @@ -39,45 +39,3 @@ Signed-off-by: Etienne Champetier /* STP */ bridge_id designated_root; bridge_id bridge_id; ---- a/net/bridge/br_sysfs_br.c -+++ b/net/bridge/br_sysfs_br.c -@@ -197,6 +197,31 @@ static ssize_t group_fwd_mask_store(stru - } - static DEVICE_ATTR_RW(group_fwd_mask); - -+static ssize_t disable_eap_hack_show(struct device *d, -+ struct device_attribute *attr, -+ char *buf) -+{ -+ struct net_bridge *br = to_bridge(d); -+ return sprintf(buf, "%u\n", br->disable_eap_hack); -+} -+ -+static int set_disable_eap_hack(struct net_bridge *br, unsigned long val, -+ struct netlink_ext_ack *extack) -+{ -+ br->disable_eap_hack = !!val; -+ -+ return 0; -+} -+ -+static ssize_t disable_eap_hack_store(struct device *d, -+ struct device_attribute *attr, -+ const char *buf, -+ size_t len) -+{ -+ return store_bridge_parm(d, buf, len, set_disable_eap_hack); -+} -+static DEVICE_ATTR_RW(disable_eap_hack); -+ - static ssize_t priority_show(struct device *d, struct device_attribute *attr, - char *buf) - { -@@ -937,6 +962,7 @@ static struct attribute *bridge_attrs[] - &dev_attr_ageing_time.attr, - &dev_attr_stp_state.attr, - &dev_attr_group_fwd_mask.attr, -+ &dev_attr_disable_eap_hack.attr, - &dev_attr_priority.attr, - &dev_attr_bridge_id.attr, - &dev_attr_root_id.attr, diff --git a/root/target/linux/generic/hack-5.15/650-netfilter-add-xt_FLOWOFFLOAD-target.patch b/root/target/linux/generic/hack-5.15/650-netfilter-add-xt_FLOWOFFLOAD-target.patch index b48f981f..c303114c 100755 --- a/root/target/linux/generic/hack-5.15/650-netfilter-add-xt_FLOWOFFLOAD-target.patch +++ b/root/target/linux/generic/hack-5.15/650-netfilter-add-xt_FLOWOFFLOAD-target.patch @@ -98,7 +98,7 @@ Signed-off-by: Felix Fietkau obj-$(CONFIG_NETFILTER_XT_TARGET_LED) += xt_LED.o --- /dev/null +++ b/net/netfilter/xt_FLOWOFFLOAD.c -@@ -0,0 +1,656 @@ +@@ -0,0 +1,658 @@ +/* + * Copyright (C) 2018-2021 Felix Fietkau + * @@ -278,6 +278,8 @@ Signed-off-by: Felix Fietkau + hook->used = true; + } + spin_unlock_bh(&hooks_lock); ++ ++ cond_resched(); +} + +static void @@ -765,7 +767,7 @@ Signed-off-by: Felix Fietkau #include #include #include -@@ -397,8 +396,7 @@ flow_offload_lookup(struct nf_flowtable +@@ -407,8 +406,7 @@ flow_offload_lookup(struct nf_flowtable } EXPORT_SYMBOL_GPL(flow_offload_lookup); @@ -775,7 +777,7 @@ Signed-off-by: Felix Fietkau void (*iter)(struct flow_offload *flow, void *data), void *data) { -@@ -430,6 +428,7 @@ nf_flow_table_iterate(struct nf_flowtabl +@@ -440,6 +438,7 @@ nf_flow_table_iterate(struct nf_flowtabl return err; } diff --git a/root/target/linux/generic/hack-5.15/651-wireless_mesh_header.patch 
b/root/target/linux/generic/hack-5.15/651-wireless_mesh_header.patch index 12a031ec..0639ad4e 100755 --- a/root/target/linux/generic/hack-5.15/651-wireless_mesh_header.patch +++ b/root/target/linux/generic/hack-5.15/651-wireless_mesh_header.patch @@ -11,7 +11,7 @@ Signed-off-by: Imre Kaloz --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h -@@ -145,8 +145,8 @@ static inline bool dev_xmit_complete(int +@@ -144,8 +144,8 @@ static inline bool dev_xmit_complete(int #if defined(CONFIG_HYPERV_NET) # define LL_MAX_HEADER 128 diff --git a/root/target/linux/generic/hack-5.15/661-use_fq_codel_by_default.patch b/root/target/linux/generic/hack-5.15/661-use_fq_codel_by_default.patch index 35dbe426..c4168e2a 100755 --- a/root/target/linux/generic/hack-5.15/661-use_fq_codel_by_default.patch +++ b/root/target/linux/generic/hack-5.15/661-use_fq_codel_by_default.patch @@ -14,7 +14,7 @@ Signed-off-by: Felix Fietkau --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h -@@ -626,12 +626,13 @@ extern struct Qdisc_ops noop_qdisc_ops; +@@ -624,12 +624,13 @@ extern struct Qdisc_ops noop_qdisc_ops; extern struct Qdisc_ops pfifo_fast_ops; extern struct Qdisc_ops mq_qdisc_ops; extern struct Qdisc_ops noqueue_qdisc_ops; diff --git a/root/target/linux/generic/hack-5.15/710-net-dsa-mv88e6xxx-default-VID-1.patch b/root/target/linux/generic/hack-5.15/710-net-dsa-mv88e6xxx-default-VID-1.patch index d0cefbfb..3c5d1b1d 100755 --- a/root/target/linux/generic/hack-5.15/710-net-dsa-mv88e6xxx-default-VID-1.patch +++ b/root/target/linux/generic/hack-5.15/710-net-dsa-mv88e6xxx-default-VID-1.patch @@ -1,6 +1,6 @@ --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c -@@ -2317,6 +2317,7 @@ static int mv88e6xxx_port_fdb_add(struct +@@ -2225,6 +2225,7 @@ static int mv88e6xxx_port_fdb_add(struct struct mv88e6xxx_chip *chip = ds->priv; int err; @@ -8,7 +8,7 @@ mv88e6xxx_reg_lock(chip); err = mv88e6xxx_port_db_load_purge(chip, port, addr, vid, MV88E6XXX_G1_ATU_DATA_STATE_UC_STATIC); -@@ -2331,6 +2332,7 @@ static int mv88e6xxx_port_fdb_del(struct +@@ -2239,6 +2240,7 @@ static int mv88e6xxx_port_fdb_del(struct struct mv88e6xxx_chip *chip = ds->priv; int err; diff --git a/root/target/linux/generic/hack-5.15/711-net-dsa-mv88e6xxx-disable-ATU-violation.patch b/root/target/linux/generic/hack-5.15/711-net-dsa-mv88e6xxx-disable-ATU-violation.patch index b94851cf..95b3894b 100755 --- a/root/target/linux/generic/hack-5.15/711-net-dsa-mv88e6xxx-disable-ATU-violation.patch +++ b/root/target/linux/generic/hack-5.15/711-net-dsa-mv88e6xxx-disable-ATU-violation.patch @@ -1,6 +1,6 @@ --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c -@@ -2979,6 +2979,9 @@ static int mv88e6xxx_setup_port(struct m +@@ -2817,6 +2817,9 @@ static int mv88e6xxx_setup_port(struct m else reg = 1 << port; diff --git a/root/target/linux/generic/hack-5.15/720-net-phy-add-aqr-phys.patch b/root/target/linux/generic/hack-5.15/720-net-phy-add-aqr-phys.patch deleted file mode 100755 index 59124990..00000000 --- a/root/target/linux/generic/hack-5.15/720-net-phy-add-aqr-phys.patch +++ /dev/null @@ -1,142 +0,0 @@ -From: Birger Koblitz -Date: Sun, 5 Sep 2021 15:13:10 +0200 -Subject: [PATCH] kernel: Add AQR113C and AQR813 support - -This hack adds support for the Aquantia 4th generation, 10GBit -PHYs AQR113C and AQR813. 
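(Illustrative aside, not part of the removed 720-net-phy-add-aqr-phys.patch: the two IDs above are matched with PHY_ID_MATCH_MODEL(), which in mainline phy.h masks off the four low revision bits, so every silicon revision of the AQR113C/AQR813 binds to these driver entries. A minimal sketch of that matching rule, with demo_* names invented here:)

/* Sketch only -- PHY_ID_MATCH_MODEL(id) expands to
 * .phy_id = (id), .phy_id_mask = GENMASK(31, 4),
 * i.e. the low 4 (revision) bits of the MII PHY ID are ignored. */
#include <linux/bits.h>
#include <linux/types.h>

#define DEMO_PHY_ID_AQR113C	0x31c31c12

static inline bool demo_matches_aqr113c(u32 phy_id)
{
	u32 model_mask = GENMASK(31, 4);

	return (phy_id & model_mask) == (DEMO_PHY_ID_AQR113C & model_mask);
}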
- -Signed-off-by: Birger Koblitz - ---- a/drivers/net/phy/aquantia_main.c -+++ b/drivers/net/phy/aquantia_main.c -@@ -20,8 +20,10 @@ - #define PHY_ID_AQR105 0x03a1b4a2 - #define PHY_ID_AQR106 0x03a1b4d0 - #define PHY_ID_AQR107 0x03a1b4e0 -+#define PHY_ID_AQR113C 0x31c31c12 - #define PHY_ID_AQCS109 0x03a1b5c2 - #define PHY_ID_AQR405 0x03a1b4b0 -+#define PHY_ID_AQR813 0x31c31cb2 - - #define MDIO_PHYXS_VEND_IF_STATUS 0xe812 - #define MDIO_PHYXS_VEND_IF_STATUS_TYPE_MASK GENMASK(7, 3) -@@ -359,6 +361,49 @@ static int aqr107_read_rate(struct phy_d - return 0; - } - -+static int aqr113c_read_status(struct phy_device *phydev) -+{ -+ int val, ret; -+ -+ ret = aqr_read_status(phydev); -+ if (ret) -+ return ret; -+ -+ if (!phydev->link || phydev->autoneg == AUTONEG_DISABLE) -+ return 0; -+ -+ // On AQR113C, the speed returned by aqr_read_status is wrong -+ aqr107_read_rate(phydev); -+ -+ val = phy_read_mmd(phydev, MDIO_MMD_PHYXS, MDIO_PHYXS_VEND_IF_STATUS); -+ if (val < 0) -+ return val; -+ -+ switch (FIELD_GET(MDIO_PHYXS_VEND_IF_STATUS_TYPE_MASK, val)) { -+ case MDIO_PHYXS_VEND_IF_STATUS_TYPE_KR: -+ phydev->interface = PHY_INTERFACE_MODE_10GKR; -+ break; -+ case MDIO_PHYXS_VEND_IF_STATUS_TYPE_XFI: -+ phydev->interface = PHY_INTERFACE_MODE_10GBASER; -+ break; -+ case MDIO_PHYXS_VEND_IF_STATUS_TYPE_USXGMII: -+ phydev->interface = PHY_INTERFACE_MODE_USXGMII; -+ break; -+ case MDIO_PHYXS_VEND_IF_STATUS_TYPE_SGMII: -+ phydev->interface = PHY_INTERFACE_MODE_SGMII; -+ break; -+ case MDIO_PHYXS_VEND_IF_STATUS_TYPE_OCSGMII: -+ phydev->interface = PHY_INTERFACE_MODE_2500BASEX; -+ break; -+ default: -+ phydev->interface = PHY_INTERFACE_MODE_NA; -+ break; -+ } -+ -+ /* Read downshifted rate from vendor register */ -+ return aqr107_read_rate(phydev); -+} -+ - static int aqr107_read_status(struct phy_device *phydev) - { - int val, ret; -@@ -489,7 +534,7 @@ static void aqr107_chip_info(struct phy_ - build_id = FIELD_GET(VEND1_GLOBAL_RSVD_STAT1_FW_BUILD_ID, val); - prov_id = FIELD_GET(VEND1_GLOBAL_RSVD_STAT1_PROV_ID, val); - -- phydev_dbg(phydev, "FW %u.%u, Build %u, Provisioning %u\n", -+ phydev_info(phydev, "FW %u.%u, Build %u, Provisioning %u\n", - fw_major, fw_minor, build_id, prov_id); - } - -@@ -661,6 +706,24 @@ static struct phy_driver aqr_driver[] = - .link_change_notify = aqr107_link_change_notify, - }, - { -+ PHY_ID_MATCH_MODEL(PHY_ID_AQR113C), -+ .name = "Aquantia AQR113C", -+ .probe = aqr107_probe, -+ .config_init = aqr107_config_init, -+ .config_aneg = aqr_config_aneg, -+ .config_intr = aqr_config_intr, -+ .handle_interrupt = aqr_handle_interrupt, -+ .read_status = aqr113c_read_status, -+ .get_tunable = aqr107_get_tunable, -+ .set_tunable = aqr107_set_tunable, -+ .suspend = aqr107_suspend, -+ .resume = aqr107_resume, -+ .get_sset_count = aqr107_get_sset_count, -+ .get_strings = aqr107_get_strings, -+ .get_stats = aqr107_get_stats, -+ .link_change_notify = aqr107_link_change_notify, -+}, -+{ - PHY_ID_MATCH_MODEL(PHY_ID_AQCS109), - .name = "Aquantia AQCS109", - .probe = aqr107_probe, -@@ -686,6 +749,24 @@ static struct phy_driver aqr_driver[] = - .handle_interrupt = aqr_handle_interrupt, - .read_status = aqr_read_status, - }, -+{ -+ PHY_ID_MATCH_MODEL(PHY_ID_AQR813), -+ .name = "Aquantia AQR813", -+ .probe = aqr107_probe, -+ .config_init = aqr107_config_init, -+ .config_aneg = aqr_config_aneg, -+ .config_intr = aqr_config_intr, -+ .handle_interrupt = aqr_handle_interrupt, -+ .read_status = aqr113c_read_status, -+ .get_tunable = aqr107_get_tunable, -+ .set_tunable = aqr107_set_tunable, -+ .suspend = 
aqr107_suspend, -+ .resume = aqr107_resume, -+ .get_sset_count = aqr107_get_sset_count, -+ .get_strings = aqr107_get_strings, -+ .get_stats = aqr107_get_stats, -+ .link_change_notify = aqr107_link_change_notify, -+}, - }; - - module_phy_driver(aqr_driver); -@@ -696,8 +777,10 @@ static struct mdio_device_id __maybe_unu - { PHY_ID_MATCH_MODEL(PHY_ID_AQR105) }, - { PHY_ID_MATCH_MODEL(PHY_ID_AQR106) }, - { PHY_ID_MATCH_MODEL(PHY_ID_AQR107) }, -+ { PHY_ID_MATCH_MODEL(PHY_ID_AQR113C) }, - { PHY_ID_MATCH_MODEL(PHY_ID_AQCS109) }, - { PHY_ID_MATCH_MODEL(PHY_ID_AQR405) }, -+ { PHY_ID_MATCH_MODEL(PHY_ID_AQR813) }, - { } - }; - diff --git a/root/target/linux/generic/hack-5.15/721-net-add-packet-mangeling.patch b/root/target/linux/generic/hack-5.15/721-net-add-packet-mangeling.patch deleted file mode 100755 index cea52fdc..00000000 --- a/root/target/linux/generic/hack-5.15/721-net-add-packet-mangeling.patch +++ /dev/null @@ -1,178 +0,0 @@ -From ffe387740bbe88dd88bbe04d6375902708003d6e Mon Sep 17 00:00:00 2001 -From: Felix Fietkau -Date: Fri, 7 Jul 2017 17:25:00 +0200 -Subject: net: add packet mangeling - -ar8216 switches have a hardware bug, which renders normal 802.1q support -unusable. Packet mangling is required to fix up the vlan for incoming -packets. - -Signed-off-by: Felix Fietkau ---- - include/linux/netdevice.h | 11 +++++++++++ - include/linux/skbuff.h | 14 ++++---------- - net/Kconfig | 6 ++++++ - net/core/dev.c | 20 +++++++++++++++----- - net/core/skbuff.c | 17 +++++++++++++++++ - net/ethernet/eth.c | 6 ++++++ - 6 files changed, 59 insertions(+), 15 deletions(-) - ---- a/include/linux/netdevice.h -+++ b/include/linux/netdevice.h -@@ -1648,6 +1648,10 @@ enum netdev_priv_flags { - IFF_TX_SKB_NO_LINEAR = 1<<31, - }; - -+enum netdev_extra_priv_flags { -+ IFF_NO_IP_ALIGN = 1<<0, -+}; -+ - #define IFF_802_1Q_VLAN IFF_802_1Q_VLAN - #define IFF_EBRIDGE IFF_EBRIDGE - #define IFF_BONDING IFF_BONDING -@@ -1680,6 +1684,7 @@ enum netdev_priv_flags { - #define IFF_L3MDEV_RX_HANDLER IFF_L3MDEV_RX_HANDLER - #define IFF_LIVE_RENAME_OK IFF_LIVE_RENAME_OK - #define IFF_TX_SKB_NO_LINEAR IFF_TX_SKB_NO_LINEAR -+#define IFF_NO_IP_ALIGN IFF_NO_IP_ALIGN - - /* Specifies the type of the struct net_device::ml_priv pointer */ - enum netdev_ml_priv_type { -@@ -1981,6 +1986,7 @@ struct net_device { - /* Read-mostly cache-line for fast-path access */ - unsigned int flags; - unsigned int priv_flags; -+ unsigned int extra_priv_flags; - const struct net_device_ops *netdev_ops; - int ifindex; - unsigned short gflags; -@@ -2041,6 +2047,11 @@ struct net_device { - const struct tlsdev_ops *tlsdev_ops; - #endif - -+#ifdef CONFIG_ETHERNET_PACKET_MANGLE -+ void (*eth_mangle_rx)(struct net_device *dev, struct sk_buff *skb); -+ struct sk_buff *(*eth_mangle_tx)(struct net_device *dev, struct sk_buff *skb); -+#endif -+ - const struct header_ops *header_ops; - - unsigned char operstate; -@@ -2115,6 +2126,10 @@ struct net_device { - struct mctp_dev __rcu *mctp_ptr; - #endif - -+#ifdef CONFIG_ETHERNET_PACKET_MANGLE -+ void *phy_ptr; /* PHY device specific data */ -+#endif -+ - /* - * Cache lines mostly used on receive path (including eth_type_trans()) - */ ---- a/include/linux/skbuff.h -+++ b/include/linux/skbuff.h -@@ -2758,6 +2758,10 @@ static inline int pskb_trim(struct sk_bu - return (len < skb->len) ? 
__pskb_trim(skb, len) : 0; - } - -+extern struct sk_buff *__netdev_alloc_skb_ip_align(struct net_device *dev, -+ unsigned int length, gfp_t gfp); -+ -+ - /** - * pskb_trim_unique - remove end from a paged unique (not cloned) buffer - * @skb: buffer to alter -@@ -2908,16 +2912,6 @@ static inline struct sk_buff *dev_alloc_ - } - - --static inline struct sk_buff *__netdev_alloc_skb_ip_align(struct net_device *dev, -- unsigned int length, gfp_t gfp) --{ -- struct sk_buff *skb = __netdev_alloc_skb(dev, length + NET_IP_ALIGN, gfp); -- -- if (NET_IP_ALIGN && skb) -- skb_reserve(skb, NET_IP_ALIGN); -- return skb; --} -- - static inline struct sk_buff *netdev_alloc_skb_ip_align(struct net_device *dev, - unsigned int length) - { ---- a/net/Kconfig -+++ b/net/Kconfig -@@ -26,6 +26,12 @@ menuconfig NET - - if NET - -+config ETHERNET_PACKET_MANGLE -+ bool -+ help -+ This option can be selected by phy drivers that need to mangle -+ packets going in or out of an ethernet device. -+ - config WANT_COMPAT_NETLINK_MESSAGES - bool - help ---- a/net/core/dev.c -+++ b/net/core/dev.c -@@ -3578,6 +3578,11 @@ static int xmit_one(struct sk_buff *skb, - if (dev_nit_active(dev)) - dev_queue_xmit_nit(skb, dev); - -+#ifdef CONFIG_ETHERNET_PACKET_MANGLE -+ if (dev->eth_mangle_tx && !(skb = dev->eth_mangle_tx(dev, skb))) -+ return NETDEV_TX_OK; -+#endif -+ - len = skb->len; - PRANDOM_ADD_NOISE(skb, dev, txq, len + jiffies); - trace_net_dev_start_xmit(skb, dev); ---- a/net/core/skbuff.c -+++ b/net/core/skbuff.c -@@ -61,6 +61,7 @@ - #include - #include - #include -+#include - - #include - #include -@@ -602,6 +603,22 @@ skb_fail: - } - EXPORT_SYMBOL(__napi_alloc_skb); - -+struct sk_buff *__netdev_alloc_skb_ip_align(struct net_device *dev, -+ unsigned int length, gfp_t gfp) -+{ -+ struct sk_buff *skb = __netdev_alloc_skb(dev, length + NET_IP_ALIGN, gfp); -+ -+#ifdef CONFIG_ETHERNET_PACKET_MANGLE -+ if (dev && (dev->extra_priv_flags & IFF_NO_IP_ALIGN)) -+ return skb; -+#endif -+ -+ if (NET_IP_ALIGN && skb) -+ skb_reserve(skb, NET_IP_ALIGN); -+ return skb; -+} -+EXPORT_SYMBOL(__netdev_alloc_skb_ip_align); -+ - void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off, - int size, unsigned int truesize) - { ---- a/net/ethernet/eth.c -+++ b/net/ethernet/eth.c -@@ -170,6 +170,12 @@ __be16 eth_type_trans(struct sk_buff *sk - const struct ethhdr *eth; - - skb->dev = dev; -+ -+#ifdef CONFIG_ETHERNET_PACKET_MANGLE -+ if (dev->eth_mangle_rx) -+ dev->eth_mangle_rx(dev, skb); -+#endif -+ - skb_reset_mac_header(skb); - - eth = (struct ethhdr *)skb->data; diff --git a/root/target/linux/generic/hack-5.15/760-net-usb-r8152-add-LED-configuration-from-OF.patch b/root/target/linux/generic/hack-5.15/760-net-usb-r8152-add-LED-configuration-from-OF.patch deleted file mode 100755 index 1b854608..00000000 --- a/root/target/linux/generic/hack-5.15/760-net-usb-r8152-add-LED-configuration-from-OF.patch +++ /dev/null @@ -1,74 +0,0 @@ -From 82985725e071f2a5735052f18e109a32aeac3a0b Mon Sep 17 00:00:00 2001 -From: David Bauer -Date: Sun, 26 Jul 2020 02:38:31 +0200 -Subject: [PATCH] net: usb: r8152: add LED configuration from OF - -This adds the ability to configure the LED configuration register using -OF. This way, the correct value for board specific LED configuration can -be determined. 
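(Context for the eth_mangle_rx/eth_mangle_tx hooks removed above in 721-net-add-packet-mangeling.patch: the hack only adds the hook pointers; a switch or PHY driver is expected to install them. A hypothetical consumer, with made-up ar8216_demo_* names, might look like this:)

/* Hypothetical consumer of the removed packet-mangling hooks.
 * The VLAN fix-up itself is elided; only the wiring is shown. */
#include <linux/netdevice.h>
#include <linux/skbuff.h>

static void ar8216_demo_mangle_rx(struct net_device *dev, struct sk_buff *skb)
{
	/* rewrite the 802.1q tag the buggy switch emitted on ingress */
}

static struct sk_buff *ar8216_demo_mangle_tx(struct net_device *dev,
					     struct sk_buff *skb)
{
	return skb;	/* returning NULL would drop the frame in xmit_one() */
}

static void ar8216_demo_attach(struct net_device *dev)
{
#ifdef CONFIG_ETHERNET_PACKET_MANGLE
	dev->eth_mangle_rx = ar8216_demo_mangle_rx;
	dev->eth_mangle_tx = ar8216_demo_mangle_tx;
#endif
}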
- -Signed-off-by: David Bauer ---- - drivers/net/usb/r8152.c | 23 +++++++++++++++++++++++ - 1 file changed, 23 insertions(+) - ---- a/drivers/net/usb/r8152.c -+++ b/drivers/net/usb/r8152.c -@@ -11,6 +11,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -6822,6 +6823,22 @@ static void rtl_tally_reset(struct r8152 - ocp_write_word(tp, MCU_TYPE_PLA, PLA_RSTTALLY, ocp_data); - } - -+static int r8152_led_configuration(struct r8152 *tp) -+{ -+ u32 led_data; -+ int ret; -+ -+ ret = of_property_read_u32(tp->udev->dev.of_node, "realtek,led-data", -+ &led_data); -+ -+ if (ret) -+ return ret; -+ -+ ocp_write_word(tp, MCU_TYPE_PLA, PLA_LEDSEL, led_data); -+ -+ return 0; -+} -+ - static void r8152b_init(struct r8152 *tp) - { - u32 ocp_data; -@@ -6863,6 +6880,8 @@ static void r8152b_init(struct r8152 *tp - ocp_data = ocp_read_word(tp, MCU_TYPE_USB, USB_USB_CTRL); - ocp_data &= ~(RX_AGG_DISABLE | RX_ZERO_EN); - ocp_write_word(tp, MCU_TYPE_USB, USB_USB_CTRL, ocp_data); -+ -+ r8152_led_configuration(tp); - } - - static void r8153_init(struct r8152 *tp) -@@ -7003,6 +7022,8 @@ static void r8153_init(struct r8152 *tp) - tp->coalesce = COALESCE_SLOW; - break; - } -+ -+ r8152_led_configuration(tp); - } - - static void r8153b_init(struct r8152 *tp) -@@ -7085,6 +7106,8 @@ static void r8153b_init(struct r8152 *tp - rtl_tally_reset(tp); - - tp->coalesce = 15000; /* 15 us */ -+ -+ r8152_led_configuration(tp); - } - - static void r8153c_init(struct r8152 *tp) diff --git a/root/target/linux/generic/hack-5.15/761-dt-bindings-net-add-RTL8152-binding-documentation.patch b/root/target/linux/generic/hack-5.15/761-dt-bindings-net-add-RTL8152-binding-documentation.patch deleted file mode 100755 index be262b99..00000000 --- a/root/target/linux/generic/hack-5.15/761-dt-bindings-net-add-RTL8152-binding-documentation.patch +++ /dev/null @@ -1,54 +0,0 @@ -From 3ee05f4aa64fc86af3be5bc176ba5808de9260a7 Mon Sep 17 00:00:00 2001 -From: David Bauer -Date: Sun, 26 Jul 2020 15:30:33 +0200 -Subject: [PATCH] dt-bindings: net: add RTL8152 binding documentation - -Add binding documentation for the Realtek RTL8152 / RTL8153 USB ethernet -adapters. - -Signed-off-by: David Bauer ---- - .../bindings/net/realtek,rtl8152.yaml | 36 +++++++++++++++++++ - 1 file changed, 36 insertions(+) - create mode 100644 Documentation/devicetree/bindings/net/realtek,rtl8152.yaml - ---- /dev/null -+++ b/Documentation/devicetree/bindings/net/realtek,rtl8152.yaml -@@ -0,0 +1,36 @@ -+# SPDX-License-Identifier: GPL-2.0 -+%YAML 1.2 -+--- -+$id: http://devicetree.org/schemas/net/realtek,rtl8152.yaml# -+$schema: http://devicetree.org/meta-schemas/core.yaml# -+ -+title: Realtek RTL8152/RTL8153 series USB ethernet -+ -+maintainers: -+ - David Bauer -+ -+properties: -+ compatible: -+ oneOf: -+ - items: -+ - enum: -+ - realtek,rtl8152 -+ - realtek,rtl8153 -+ -+ reg: -+ description: The device number on the USB bus -+ -+ realtek,led-data: -+ description: Value to be written to the LED configuration register. 
-+ -+required: -+ - compatible -+ - reg -+ -+examples: -+ - | -+ usb-eth@2 { -+ compatible = "realtek,rtl8153"; -+ reg = <2>; -+ realtek,led-data = <0x87>; -+ }; -\ No newline at end of file diff --git a/root/target/linux/generic/hack-5.15/773-bgmac-add-srab-switch.patch b/root/target/linux/generic/hack-5.15/773-bgmac-add-srab-switch.patch index 1e4fc446..cc6eddbf 100755 --- a/root/target/linux/generic/hack-5.15/773-bgmac-add-srab-switch.patch +++ b/root/target/linux/generic/hack-5.15/773-bgmac-add-srab-switch.patch @@ -14,7 +14,7 @@ Signed-off-by: Hauke Mehrtens --- a/drivers/net/ethernet/broadcom/bgmac-bcma.c +++ b/drivers/net/ethernet/broadcom/bgmac-bcma.c -@@ -280,6 +280,7 @@ static int bgmac_probe(struct bcma_devic +@@ -268,6 +268,7 @@ static int bgmac_probe(struct bcma_devic bgmac->feature_flags |= BGMAC_FEAT_CLKCTLST; bgmac->feature_flags |= BGMAC_FEAT_NO_RESET; bgmac->feature_flags |= BGMAC_FEAT_FORCE_SPEED_2500; @@ -55,7 +55,7 @@ Signed-off-by: Hauke Mehrtens net_dev->max_mtu = BGMAC_RX_MAX_FRAME_SIZE - ETH_FCS_LEN; + if ((bgmac->feature_flags & BGMAC_FEAT_SRAB) && !bgmac_b53_pdata.regs) { -+ bgmac_b53_pdata.regs = ioremap(0x18007000, 0x1000); ++ bgmac_b53_pdata.regs = ioremap_nocache(0x18007000, 0x1000); + + err = platform_device_register(&bgmac_b53_dev); + if (!err) diff --git a/root/target/linux/generic/hack-5.15/800-GPIO-add-named-gpio-exports.patch b/root/target/linux/generic/hack-5.15/800-GPIO-add-named-gpio-exports.patch index 40c3309f..76f89acd 100755 --- a/root/target/linux/generic/hack-5.15/800-GPIO-add-named-gpio-exports.patch +++ b/root/target/linux/generic/hack-5.15/800-GPIO-add-named-gpio-exports.patch @@ -15,9 +15,9 @@ Signed-off-by: John Crispin #include "gpiolib.h" #include "gpiolib-of.h" -@@ -1052,3 +1054,72 @@ void of_gpio_dev_init(struct gpio_chip * - else - gc->of_node = gdev->dev.of_node; +@@ -1039,3 +1041,72 @@ void of_gpiochip_remove(struct gpio_chip + { + of_node_put(chip->of_node); } + +#ifdef CONFIG_GPIO_SYSFS @@ -129,7 +129,7 @@ Signed-off-by: John Crispin { --- a/drivers/gpio/gpiolib-sysfs.c +++ b/drivers/gpio/gpiolib-sysfs.c -@@ -564,7 +564,7 @@ static struct class gpio_class = { +@@ -572,7 +572,7 @@ static struct class gpio_class = { * * Returns zero on success, else an error. 
*/ @@ -138,7 +138,7 @@ Signed-off-by: John Crispin { struct gpio_chip *chip; struct gpio_device *gdev; -@@ -626,6 +626,8 @@ int gpiod_export(struct gpio_desc *desc, +@@ -634,6 +634,8 @@ int gpiod_export(struct gpio_desc *desc, offset = gpio_chip_hwgpio(desc); if (chip->names && chip->names[offset]) ioname = chip->names[offset]; @@ -147,7 +147,7 @@ Signed-off-by: John Crispin dev = device_create_with_groups(&gpio_class, &gdev->dev, MKDEV(0, 0), data, gpio_groups, -@@ -647,6 +649,12 @@ err_unlock: +@@ -655,6 +657,12 @@ err_unlock: gpiod_dbg(desc, "%s: status %d\n", __func__, status); return status; } diff --git a/root/target/linux/generic/hack-5.15/901-debloat_sock_diag.patch b/root/target/linux/generic/hack-5.15/901-debloat_sock_diag.patch index 44f0e617..34e73831 100755 --- a/root/target/linux/generic/hack-5.15/901-debloat_sock_diag.patch +++ b/root/target/linux/generic/hack-5.15/901-debloat_sock_diag.patch @@ -16,7 +16,7 @@ Signed-off-by: Felix Fietkau --- a/net/Kconfig +++ b/net/Kconfig -@@ -104,6 +104,9 @@ source "net/mptcp/Kconfig" +@@ -98,6 +98,9 @@ source "net/mptcp/Kconfig" endif # if INET @@ -58,7 +58,7 @@ Signed-off-by: Felix Fietkau static void sock_inuse_add(struct net *net, int val); -@@ -545,6 +547,18 @@ discard_and_relse: +@@ -544,6 +546,18 @@ discard_and_relse: } EXPORT_SYMBOL(__sk_receive_skb); @@ -77,7 +77,7 @@ Signed-off-by: Felix Fietkau INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *, u32)); INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, -@@ -1981,9 +1995,11 @@ static void __sk_free(struct sock *sk) +@@ -1967,9 +1981,11 @@ static void __sk_free(struct sock *sk) if (likely(sk->sk_net_refcnt)) sock_inuse_add(sock_net(sk), -1); @@ -152,7 +152,7 @@ Signed-off-by: Felix Fietkau Support for PF_PACKET sockets monitoring interface used by the ss tool. 
--- a/net/unix/Kconfig +++ b/net/unix/Kconfig -@@ -33,6 +33,7 @@ config AF_UNIX_OOB +@@ -28,6 +28,7 @@ config UNIX_SCM config UNIX_DIAG tristate "UNIX: socket monitoring interface" depends on UNIX diff --git a/root/target/linux/generic/hack-5.15/902-debloat_proc.patch b/root/target/linux/generic/hack-5.15/902-debloat_proc.patch index 0b2d5e31..349a2c02 100755 --- a/root/target/linux/generic/hack-5.15/902-debloat_proc.patch +++ b/root/target/linux/generic/hack-5.15/902-debloat_proc.patch @@ -29,7 +29,7 @@ Signed-off-by: Felix Fietkau --- a/fs/locks.c +++ b/fs/locks.c -@@ -2929,6 +2929,8 @@ static const struct seq_operations locks +@@ -3044,6 +3044,8 @@ static const struct seq_operations locks static int __init proc_locks_init(void) { @@ -158,7 +158,7 @@ Signed-off-by: Felix Fietkau IPC_SEM_IDS, sysvipc_sem_proc_show); --- a/ipc/shm.c +++ b/ipc/shm.c -@@ -154,6 +154,8 @@ pure_initcall(ipc_ns_init); +@@ -144,6 +144,8 @@ pure_initcall(ipc_ns_init); void __init shm_init(void) { @@ -235,7 +235,7 @@ Signed-off-by: Felix Fietkau if (!pe) --- a/mm/vmalloc.c +++ b/mm/vmalloc.c -@@ -3961,6 +3961,8 @@ static const struct seq_operations vmall +@@ -3899,6 +3899,8 @@ static const struct seq_operations vmall static int __init proc_vmalloc_init(void) { @@ -246,7 +246,7 @@ Signed-off-by: Felix Fietkau &vmalloc_op, --- a/mm/vmstat.c +++ b/mm/vmstat.c -@@ -2083,10 +2083,12 @@ void __init init_mm_internals(void) +@@ -2044,10 +2044,12 @@ void __init init_mm_internals(void) start_shepherd_timer(); #endif #ifdef CONFIG_PROC_FS @@ -330,7 +330,7 @@ Signed-off-by: Felix Fietkau --- a/net/core/sock.c +++ b/net/core/sock.c -@@ -3853,6 +3853,8 @@ static __net_initdata struct pernet_oper +@@ -3839,6 +3839,8 @@ static __net_initdata struct pernet_oper static int __init proto_init(void) { @@ -396,7 +396,7 @@ Signed-off-by: Felix Fietkau } --- a/net/ipv4/route.c +++ b/net/ipv4/route.c -@@ -387,6 +387,9 @@ static struct pernet_operations ip_rt_pr +@@ -386,6 +386,9 @@ static struct pernet_operations ip_rt_pr static int __init ip_rt_proc_init(void) { diff --git a/root/target/linux/generic/hack-5.15/904-debloat_dma_buf.patch b/root/target/linux/generic/hack-5.15/904-debloat_dma_buf.patch deleted file mode 100755 index fc7cd209..00000000 --- a/root/target/linux/generic/hack-5.15/904-debloat_dma_buf.patch +++ /dev/null @@ -1,92 +0,0 @@ -From e3692cb2fcd5ba1244512a0f43b8118f65f1c375 Mon Sep 17 00:00:00 2001 -From: Felix Fietkau -Date: Sat, 8 Jul 2017 08:20:43 +0200 -Subject: debloat: dmabuf - -Signed-off-by: Felix Fietkau ---- - drivers/base/Kconfig | 2 +- - drivers/dma-buf/Makefile | 10 +++++++--- - drivers/dma-buf/dma-buf.c | 4 +++- - kernel/sched/core.c | 1 + - 4 files changed, 12 insertions(+), 5 deletions(-) - ---- a/drivers/base/Kconfig -+++ b/drivers/base/Kconfig -@@ -187,7 +187,7 @@ config SOC_BUS - source "drivers/base/regmap/Kconfig" - - config DMA_SHARED_BUFFER -- bool -+ tristate - default n - select IRQ_WORK - help ---- a/drivers/dma-buf/heaps/Makefile -+++ b/drivers/dma-buf/heaps/Makefile -@@ -1,3 +1,3 @@ - # SPDX-License-Identifier: GPL-2.0 --obj-$(CONFIG_DMABUF_HEAPS_SYSTEM) += system_heap.o --obj-$(CONFIG_DMABUF_HEAPS_CMA) += cma_heap.o -+dma-buf-objs-$(CONFIG_DMABUF_HEAPS_SYSTEM) += system_heap.o -+dma-buf-objs-$(CONFIG_DMABUF_HEAPS_CMA) += cma_heap.o ---- a/drivers/dma-buf/Makefile -+++ b/drivers/dma-buf/Makefile -@@ -1,16 +1,20 @@ - # SPDX-License-Identifier: GPL-2.0-only --obj-y := dma-buf.o dma-fence.o dma-fence-array.o dma-fence-chain.o \ -+obj-$(CONFIG_DMA_SHARED_BUFFER) := dma-shared-buffer.o -+ 
-+dma-buf-objs-y := dma-buf.o dma-fence.o dma-fence-array.o dma-fence-chain.o \ - dma-resv.o seqno-fence.o --obj-$(CONFIG_DMABUF_HEAPS) += dma-heap.o --obj-$(CONFIG_DMABUF_HEAPS) += heaps/ --obj-$(CONFIG_SYNC_FILE) += sync_file.o --obj-$(CONFIG_SW_SYNC) += sw_sync.o sync_debug.o --obj-$(CONFIG_UDMABUF) += udmabuf.o --obj-$(CONFIG_DMABUF_SYSFS_STATS) += dma-buf-sysfs-stats.o -+dma-buf-objs-$(CONFIG_DMABUF_HEAPS) += dma-heap.o -+obj-$(CONFIG_DMABUF_HEAPS) += heaps/ -+dma-buf-objs-$(CONFIG_SYNC_FILE) += sync_file.o -+dma-buf-objs-$(CONFIG_SW_SYNC) += sw_sync.o sync_debug.o -+dma-buf-objs-$(CONFIG_UDMABUF) += udmabuf.o -+dma-buf-objs-$(CONFIG_DMABUF_SYSFS_STATS) += udmabuf.o - - dmabuf_selftests-y := \ - selftest.o \ - st-dma-fence.o \ - st-dma-fence-chain.o - --obj-$(CONFIG_DMABUF_SELFTESTS) += dmabuf_selftests.o -+dma-buf-objs-$(CONFIG_DMABUF_SELFTESTS) += dmabuf_selftests.o -+ -+dma-shared-buffer-objs := $(dma-buf-objs-y) ---- a/drivers/dma-buf/dma-buf.c -+++ b/drivers/dma-buf/dma-buf.c -@@ -1498,4 +1498,5 @@ static void __exit dma_buf_deinit(void) - kern_unmount(dma_buf_mnt); - dma_buf_uninit_sysfs_statistics(); - } --__exitcall(dma_buf_deinit); -+module_exit(dma_buf_deinit); -+MODULE_LICENSE("GPL"); ---- a/kernel/sched/core.c -+++ b/kernel/sched/core.c -@@ -4174,6 +4174,7 @@ int wake_up_state(struct task_struct *p, - { - return try_to_wake_up(p, state, 0); - } -+EXPORT_SYMBOL_GPL(wake_up_state); - - /* - * Perform scheduler related setup for a newly forked process p. ---- a/fs/d_path.c -+++ b/fs/d_path.c -@@ -316,6 +316,7 @@ char *dynamic_dname(struct dentry *dentr - buffer += buflen - sz; - return memcpy(buffer, temp, sz); - } -+EXPORT_SYMBOL_GPL(dynamic_dname); - - char *simple_dname(struct dentry *dentry, char *buffer, int buflen) - { diff --git a/root/target/linux/generic/hack-5.4/204-module_strip.patch b/root/target/linux/generic/hack-5.4/204-module_strip.patch deleted file mode 100755 index d6e25f31..00000000 --- a/root/target/linux/generic/hack-5.4/204-module_strip.patch +++ /dev/null @@ -1,220 +0,0 @@ -From a779a482fb9b9f8fcdf8b2519c789b4b9bb5dd05 Mon Sep 17 00:00:00 2001 -From: Felix Fietkau -Date: Fri, 7 Jul 2017 16:56:48 +0200 -Subject: build: add a hack for removing non-essential module info - -Signed-off-by: Felix Fietkau ---- - include/linux/module.h | 13 ++++++++----- - include/linux/moduleparam.h | 15 ++++++++++++--- - init/Kconfig | 7 +++++++ - kernel/module.c | 5 ++++- - scripts/mod/modpost.c | 12 ++++++++++++ - 5 files changed, 43 insertions(+), 9 deletions(-) - ---- a/include/linux/module.h -+++ b/include/linux/module.h -@@ -157,6 +157,7 @@ extern void cleanup_module(void); - - /* Generic info of form tag = "info" */ - #define MODULE_INFO(tag, info) __MODULE_INFO(tag, tag, info) -+#define MODULE_INFO_STRIP(tag, info) __MODULE_INFO_STRIP(tag, tag, info) - - /* For userspace: you can also call me... */ - #define MODULE_ALIAS(_alias) MODULE_INFO(alias, _alias) -@@ -216,12 +217,12 @@ extern void cleanup_module(void); - * Author(s), use "Name " or just "Name", for multiple - * authors use multiple MODULE_AUTHOR() statements/lines. - */ --#define MODULE_AUTHOR(_author) MODULE_INFO(author, _author) -+#define MODULE_AUTHOR(_author) MODULE_INFO_STRIP(author, _author) - - /* What your module does. 
*/ --#define MODULE_DESCRIPTION(_description) MODULE_INFO(description, _description) -+#define MODULE_DESCRIPTION(_description) MODULE_INFO_STRIP(description, _description) - --#ifdef MODULE -+#if defined(MODULE) && !defined(CONFIG_MODULE_STRIPPED) - /* Creates an alias so file2alias.c can find device table. */ - #define MODULE_DEVICE_TABLE(type, name) \ - extern typeof(name) __mod_##type##__##name##_device_table \ -@@ -248,7 +249,9 @@ extern typeof(name) __mod_##type##__##na - */ - - #if defined(MODULE) || !defined(CONFIG_SYSFS) --#define MODULE_VERSION(_version) MODULE_INFO(version, _version) -+#define MODULE_VERSION(_version) MODULE_INFO_STRIP(version, _version) -+#elif defined(CONFIG_MODULE_STRIPPED) -+#define MODULE_VERSION(_version) __MODULE_INFO_DISABLED(version) - #else - #define MODULE_VERSION(_version) \ - MODULE_INFO(version, _version); \ -@@ -271,7 +274,7 @@ extern typeof(name) __mod_##type##__##na - /* Optional firmware file (or files) needed by the module - * format is simply firmware file name. Multiple firmware - * files require multiple MODULE_FIRMWARE() specifiers */ --#define MODULE_FIRMWARE(_firmware) MODULE_INFO(firmware, _firmware) -+#define MODULE_FIRMWARE(_firmware) MODULE_INFO_STRIP(firmware, _firmware) - - #define MODULE_IMPORT_NS(ns) MODULE_INFO(import_ns, #ns) - ---- a/include/linux/moduleparam.h -+++ b/include/linux/moduleparam.h -@@ -20,10 +20,24 @@ - /* Chosen so that structs with an unsigned long line up. */ - #define MAX_PARAM_PREFIX_LEN (64 - sizeof(unsigned long)) - -+/* This struct is here for syntactic coherency, it is not used */ -+#define __MODULE_INFO_DISABLED(name) \ -+ struct __UNIQUE_ID(name) {} -+ -+#ifdef CONFIG_MODULE_STRIPPED -+#define __MODULE_INFO_STRIP(tag, name, info) __MODULE_INFO_DISABLED(name) -+#else -+#define __MODULE_INFO_STRIP(tag, name, info) __MODULE_INFO(tag, name, info) -+#endif -+ -+#ifdef MODULE - #define __MODULE_INFO(tag, name, info) \ - static const char __UNIQUE_ID(name)[] \ - __used __attribute__((section(".modinfo"), unused, aligned(1))) \ - = __MODULE_INFO_PREFIX __stringify(tag) "=" info -+#else -+#define __MODULE_INFO(tag, name, info) __MODULE_INFO_DISABLED(name) -+#endif - - #define __MODULE_PARM_TYPE(name, _type) \ - __MODULE_INFO(parmtype, name##type, #name ":" _type) -@@ -31,7 +45,7 @@ static const char __UNIQUE_ID(name)[] - /* One for each parameter, describing how to use it. Some files do - multiple of these per line, so can't just use MODULE_INFO. */ - #define MODULE_PARM_DESC(_parm, desc) \ -- __MODULE_INFO(parm, _parm, #_parm ":" desc) -+ __MODULE_INFO_STRIP(parm, _parm, #_parm ":" desc) - - struct kernel_param; - ---- a/init/Kconfig -+++ b/init/Kconfig -@@ -2198,6 +2198,13 @@ config TRIM_UNUSED_KSYMS - - If unsure, or if you need to build out-of-tree modules, say N. - -+config MODULE_STRIPPED -+ bool "Reduce module size" -+ depends on MODULES -+ help -+ Remove module parameter descriptions, author info, version, aliases, -+ device tables, etc. 
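(To make the effect of the CONFIG_MODULE_STRIPPED option above concrete, a hypothetical module -- not part of the patch -- showing which metadata survives: MODULE_LICENSE() goes through plain MODULE_INFO() and is kept, while entries routed through MODULE_INFO_STRIP()/__MODULE_INFO_STRIP() compile to nothing:)

/* Hypothetical example module -- illustration only. */
#include <linux/module.h>

static int debug;
module_param(debug, int, 0644);
MODULE_PARM_DESC(debug, "debug level");			/* stripped */

static int __init demo_strip_init(void)
{
	return 0;
}
module_init(demo_strip_init);

static void __exit demo_strip_exit(void)
{
}
module_exit(demo_strip_exit);

MODULE_LICENSE("GPL");					/* kept */
MODULE_AUTHOR("Jane Example");				/* stripped */
MODULE_DESCRIPTION("CONFIG_MODULE_STRIPPED demo");	/* stripped */
MODULE_VERSION("1.0");					/* stripped when built as a module */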
-+ - endif # MODULES - - config MODULES_TREE_LOOKUP ---- a/kernel/module.c -+++ b/kernel/module.c -@@ -1281,6 +1281,7 @@ static struct module_attribute *modinfo_ - - static const char vermagic[] = VERMAGIC_STRING; - -+#if defined(CONFIG_MODVERSIONS) || !defined(CONFIG_MODULE_STRIPPED) - static int try_to_force_load(struct module *mod, const char *reason) - { - #ifdef CONFIG_MODULE_FORCE_LOAD -@@ -1292,6 +1293,7 @@ static int try_to_force_load(struct modu - return -ENOEXEC; - #endif - } -+#endif - - #ifdef CONFIG_MODVERSIONS - -@@ -3256,9 +3258,11 @@ static int setup_load_info(struct load_i - - static int check_modinfo(struct module *mod, struct load_info *info, int flags) - { -- const char *modmagic = get_modinfo(info, "vermagic"); - int err; - -+#ifndef CONFIG_MODULE_STRIPPED -+ const char *modmagic = get_modinfo(info, "vermagic"); -+ - if (flags & MODULE_INIT_IGNORE_VERMAGIC) - modmagic = NULL; - -@@ -3279,6 +3283,7 @@ static int check_modinfo(struct module * - mod->name); - add_taint_module(mod, TAINT_OOT_MODULE, LOCKDEP_STILL_OK); - } -+#endif - - check_modinfo_retpoline(mod, info); - ---- a/scripts/mod/modpost.c -+++ b/scripts/mod/modpost.c -@@ -2056,7 +2056,9 @@ static void read_symbols(const char *mod - symname = remove_dot(info.strtab + sym->st_name); - - handle_modversions(mod, &info, sym, symname); -+#ifndef CONFIG_MODULE_STRIPPED - handle_moddevtable(mod, &info, sym, symname); -+#endif - } - - /* Apply symbol namespaces from __kstrtabns_ entries. */ -@@ -2270,8 +2272,10 @@ static void add_header(struct buffer *b, - buf_printf(b, "\n"); - buf_printf(b, "BUILD_SALT;\n"); - buf_printf(b, "\n"); -+#ifndef CONFIG_MODULE_STRIPPED - buf_printf(b, "MODULE_INFO(vermagic, VERMAGIC_STRING);\n"); - buf_printf(b, "MODULE_INFO(name, KBUILD_MODNAME);\n"); -+#endif - buf_printf(b, "\n"); - buf_printf(b, "__visible struct module __this_module\n"); - buf_printf(b, "__section(.gnu.linkonce.this_module) = {\n"); -@@ -2288,8 +2292,10 @@ static void add_header(struct buffer *b, - - static void add_intree_flag(struct buffer *b, int is_intree) - { -+#ifndef CONFIG_MODULE_STRIPPED - if (is_intree) - buf_printf(b, "\nMODULE_INFO(intree, \"Y\");\n"); -+#endif - } - - /* Cannot check for assembler */ -@@ -2302,8 +2308,10 @@ static void add_retpoline(struct buffer - - static void add_staging_flag(struct buffer *b, const char *name) - { -+#ifndef CONFIG_MODULE_STRIPPED - if (strstarts(name, "drivers/staging")) - buf_printf(b, "\nMODULE_INFO(staging, \"Y\");\n"); -+#endif - } - - /** -@@ -2387,11 +2395,13 @@ static void add_depends(struct buffer *b - - static void add_srcversion(struct buffer *b, struct module *mod) - { -+#ifndef CONFIG_MODULE_STRIPPED - if (mod->srcversion[0]) { - buf_printf(b, "\n"); - buf_printf(b, "MODULE_INFO(srcversion, \"%s\");\n", - mod->srcversion); - } -+#endif - } - - static void write_if_changed(struct buffer *b, const char *fname) -@@ -2661,7 +2671,9 @@ int main(int argc, char **argv) - add_staging_flag(&buf, mod->name); - err |= add_versions(&buf, mod); - add_depends(&buf, mod); -+#ifndef CONFIG_MODULE_STRIPPED - add_moddevtable(&buf, mod); -+#endif - add_srcversion(&buf, mod); - - sprintf(fname, "%s.mod.c", mod->name); diff --git a/root/target/linux/generic/hack-5.4/205-kconfig-exit.patch b/root/target/linux/generic/hack-5.4/205-kconfig-exit.patch deleted file mode 100755 index 8931ad32..00000000 --- a/root/target/linux/generic/hack-5.4/205-kconfig-exit.patch +++ /dev/null @@ -1,11 +0,0 @@ ---- a/scripts/kconfig/conf.c -+++ b/scripts/kconfig/conf.c -@@ -212,6 +212,8 @@ static 
int conf_sym(struct menu *menu) - break; - continue; - case 0: -+ if (!sym_has_value(sym) && !tty_stdio && getenv("FAIL_ON_UNCONFIGURED")) -+ exit(1); - newval = oldval; - break; - case '?': diff --git a/root/target/linux/generic/hack-5.4/210-darwin_scripts_include.patch b/root/target/linux/generic/hack-5.4/210-darwin_scripts_include.patch deleted file mode 100755 index be6adc0d..00000000 --- a/root/target/linux/generic/hack-5.4/210-darwin_scripts_include.patch +++ /dev/null @@ -1,3053 +0,0 @@ -From db7c30dcd9a0391bf13b62c9f91e144d762ef43a Mon Sep 17 00:00:00 2001 -From: Florian Fainelli -Date: Fri, 7 Jul 2017 17:00:49 +0200 -Subject: Add an OSX specific patch to make the kernel be compiled - -lede-commit: 3fc2a24f0422b2f55f9ed43f116db3111f700526 -Signed-off-by: Florian Fainelli ---- - scripts/kconfig/Makefile | 3 + - scripts/mod/elf.h | 3007 ++++++++++++++++++++++++++++++++++++++++++++ - scripts/mod/mk_elfconfig.c | 4 + - scripts/mod/modpost.h | 4 + - 4 files changed, 3018 insertions(+) - create mode 100644 scripts/mod/elf.h - ---- /dev/null -+++ b/scripts/mod/elf.h -@@ -0,0 +1,3007 @@ -+/* This file defines standard ELF types, structures, and macros. -+ Copyright (C) 1995-2012 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#ifndef _ELF_H -+#define _ELF_H 1 -+ -+/* Standard ELF types. */ -+ -+#include -+ -+/* Type for a 16-bit quantity. */ -+typedef uint16_t Elf32_Half; -+typedef uint16_t Elf64_Half; -+ -+/* Types for signed and unsigned 32-bit quantities. */ -+typedef uint32_t Elf32_Word; -+typedef int32_t Elf32_Sword; -+typedef uint32_t Elf64_Word; -+typedef int32_t Elf64_Sword; -+ -+/* Types for signed and unsigned 64-bit quantities. */ -+typedef uint64_t Elf32_Xword; -+typedef int64_t Elf32_Sxword; -+typedef uint64_t Elf64_Xword; -+typedef int64_t Elf64_Sxword; -+ -+/* Type of addresses. */ -+typedef uint32_t Elf32_Addr; -+typedef uint64_t Elf64_Addr; -+ -+/* Type of file offsets. */ -+typedef uint32_t Elf32_Off; -+typedef uint64_t Elf64_Off; -+ -+/* Type for section indices, which are 16-bit quantities. */ -+typedef uint16_t Elf32_Section; -+typedef uint16_t Elf64_Section; -+ -+/* Type for version symbol information. */ -+typedef Elf32_Half Elf32_Versym; -+typedef Elf64_Half Elf64_Versym; -+ -+ -+/* The ELF file header. This appears at the start of every ELF file. 
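(Usage sketch for the ELF definitions that follow -- a hypothetical helper, not taken from scripts/mod: before any other header field is trusted, the e_ident bytes are normally validated roughly like this:)

/* Hypothetical validation built on the e_ident macros defined below. */
#include <string.h>

static int demo_is_supported_elf(const unsigned char *e_ident)
{
	return memcmp(e_ident, ELFMAG, SELFMAG) == 0 &&
	       (e_ident[EI_CLASS] == ELFCLASS32 ||
		e_ident[EI_CLASS] == ELFCLASS64) &&
	       e_ident[EI_VERSION] == EV_CURRENT;
}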
*/ -+ -+#define EI_NIDENT (16) -+ -+typedef struct -+{ -+ unsigned char e_ident[EI_NIDENT]; /* Magic number and other info */ -+ Elf32_Half e_type; /* Object file type */ -+ Elf32_Half e_machine; /* Architecture */ -+ Elf32_Word e_version; /* Object file version */ -+ Elf32_Addr e_entry; /* Entry point virtual address */ -+ Elf32_Off e_phoff; /* Program header table file offset */ -+ Elf32_Off e_shoff; /* Section header table file offset */ -+ Elf32_Word e_flags; /* Processor-specific flags */ -+ Elf32_Half e_ehsize; /* ELF header size in bytes */ -+ Elf32_Half e_phentsize; /* Program header table entry size */ -+ Elf32_Half e_phnum; /* Program header table entry count */ -+ Elf32_Half e_shentsize; /* Section header table entry size */ -+ Elf32_Half e_shnum; /* Section header table entry count */ -+ Elf32_Half e_shstrndx; /* Section header string table index */ -+} Elf32_Ehdr; -+ -+typedef struct -+{ -+ unsigned char e_ident[EI_NIDENT]; /* Magic number and other info */ -+ Elf64_Half e_type; /* Object file type */ -+ Elf64_Half e_machine; /* Architecture */ -+ Elf64_Word e_version; /* Object file version */ -+ Elf64_Addr e_entry; /* Entry point virtual address */ -+ Elf64_Off e_phoff; /* Program header table file offset */ -+ Elf64_Off e_shoff; /* Section header table file offset */ -+ Elf64_Word e_flags; /* Processor-specific flags */ -+ Elf64_Half e_ehsize; /* ELF header size in bytes */ -+ Elf64_Half e_phentsize; /* Program header table entry size */ -+ Elf64_Half e_phnum; /* Program header table entry count */ -+ Elf64_Half e_shentsize; /* Section header table entry size */ -+ Elf64_Half e_shnum; /* Section header table entry count */ -+ Elf64_Half e_shstrndx; /* Section header string table index */ -+} Elf64_Ehdr; -+ -+/* Fields in the e_ident array. The EI_* macros are indices into the -+ array. The macros under each EI_* macro are the values the byte -+ may have. */ -+ -+#define EI_MAG0 0 /* File identification byte 0 index */ -+#define ELFMAG0 0x7f /* Magic number byte 0 */ -+ -+#define EI_MAG1 1 /* File identification byte 1 index */ -+#define ELFMAG1 'E' /* Magic number byte 1 */ -+ -+#define EI_MAG2 2 /* File identification byte 2 index */ -+#define ELFMAG2 'L' /* Magic number byte 2 */ -+ -+#define EI_MAG3 3 /* File identification byte 3 index */ -+#define ELFMAG3 'F' /* Magic number byte 3 */ -+ -+/* Conglomeration of the identification bytes, for easy testing as a word. */ -+#define ELFMAG "\177ELF" -+#define SELFMAG 4 -+ -+#define EI_CLASS 4 /* File class byte index */ -+#define ELFCLASSNONE 0 /* Invalid class */ -+#define ELFCLASS32 1 /* 32-bit objects */ -+#define ELFCLASS64 2 /* 64-bit objects */ -+#define ELFCLASSNUM 3 -+ -+#define EI_DATA 5 /* Data encoding byte index */ -+#define ELFDATANONE 0 /* Invalid data encoding */ -+#define ELFDATA2LSB 1 /* 2's complement, little endian */ -+#define ELFDATA2MSB 2 /* 2's complement, big endian */ -+#define ELFDATANUM 3 -+ -+#define EI_VERSION 6 /* File version byte index */ -+ /* Value must be EV_CURRENT */ -+ -+#define EI_OSABI 7 /* OS ABI identification */ -+#define ELFOSABI_NONE 0 /* UNIX System V ABI */ -+#define ELFOSABI_SYSV 0 /* Alias. */ -+#define ELFOSABI_HPUX 1 /* HP-UX */ -+#define ELFOSABI_NETBSD 2 /* NetBSD. */ -+#define ELFOSABI_GNU 3 /* Object uses GNU ELF extensions. */ -+#define ELFOSABI_LINUX ELFOSABI_GNU /* Compatibility alias. */ -+#define ELFOSABI_SOLARIS 6 /* Sun Solaris. */ -+#define ELFOSABI_AIX 7 /* IBM AIX. */ -+#define ELFOSABI_IRIX 8 /* SGI Irix. */ -+#define ELFOSABI_FREEBSD 9 /* FreeBSD. 
*/ -+#define ELFOSABI_TRU64 10 /* Compaq TRU64 UNIX. */ -+#define ELFOSABI_MODESTO 11 /* Novell Modesto. */ -+#define ELFOSABI_OPENBSD 12 /* OpenBSD. */ -+#define ELFOSABI_ARM_AEABI 64 /* ARM EABI */ -+#define ELFOSABI_ARM 97 /* ARM */ -+#define ELFOSABI_STANDALONE 255 /* Standalone (embedded) application */ -+ -+#define EI_ABIVERSION 8 /* ABI version */ -+ -+#define EI_PAD 9 /* Byte index of padding bytes */ -+ -+/* Legal values for e_type (object file type). */ -+ -+#define ET_NONE 0 /* No file type */ -+#define ET_REL 1 /* Relocatable file */ -+#define ET_EXEC 2 /* Executable file */ -+#define ET_DYN 3 /* Shared object file */ -+#define ET_CORE 4 /* Core file */ -+#define ET_NUM 5 /* Number of defined types */ -+#define ET_LOOS 0xfe00 /* OS-specific range start */ -+#define ET_HIOS 0xfeff /* OS-specific range end */ -+#define ET_LOPROC 0xff00 /* Processor-specific range start */ -+#define ET_HIPROC 0xffff /* Processor-specific range end */ -+ -+/* Legal values for e_machine (architecture). */ -+ -+#define EM_NONE 0 /* No machine */ -+#define EM_M32 1 /* AT&T WE 32100 */ -+#define EM_SPARC 2 /* SUN SPARC */ -+#define EM_386 3 /* Intel 80386 */ -+#define EM_68K 4 /* Motorola m68k family */ -+#define EM_88K 5 /* Motorola m88k family */ -+#define EM_860 7 /* Intel 80860 */ -+#define EM_MIPS 8 /* MIPS R3000 big-endian */ -+#define EM_S370 9 /* IBM System/370 */ -+#define EM_MIPS_RS3_LE 10 /* MIPS R3000 little-endian */ -+ -+#define EM_PARISC 15 /* HPPA */ -+#define EM_VPP500 17 /* Fujitsu VPP500 */ -+#define EM_SPARC32PLUS 18 /* Sun's "v8plus" */ -+#define EM_960 19 /* Intel 80960 */ -+#define EM_PPC 20 /* PowerPC */ -+#define EM_PPC64 21 /* PowerPC 64-bit */ -+#define EM_S390 22 /* IBM S390 */ -+ -+#define EM_V800 36 /* NEC V800 series */ -+#define EM_FR20 37 /* Fujitsu FR20 */ -+#define EM_RH32 38 /* TRW RH-32 */ -+#define EM_RCE 39 /* Motorola RCE */ -+#define EM_ARM 40 /* ARM */ -+#define EM_FAKE_ALPHA 41 /* Digital Alpha */ -+#define EM_SH 42 /* Hitachi SH */ -+#define EM_SPARCV9 43 /* SPARC v9 64-bit */ -+#define EM_TRICORE 44 /* Siemens Tricore */ -+#define EM_ARC 45 /* Argonaut RISC Core */ -+#define EM_H8_300 46 /* Hitachi H8/300 */ -+#define EM_H8_300H 47 /* Hitachi H8/300H */ -+#define EM_H8S 48 /* Hitachi H8S */ -+#define EM_H8_500 49 /* Hitachi H8/500 */ -+#define EM_IA_64 50 /* Intel Merced */ -+#define EM_MIPS_X 51 /* Stanford MIPS-X */ -+#define EM_COLDFIRE 52 /* Motorola Coldfire */ -+#define EM_68HC12 53 /* Motorola M68HC12 */ -+#define EM_MMA 54 /* Fujitsu MMA Multimedia Accelerator*/ -+#define EM_PCP 55 /* Siemens PCP */ -+#define EM_NCPU 56 /* Sony nCPU embeeded RISC */ -+#define EM_NDR1 57 /* Denso NDR1 microprocessor */ -+#define EM_STARCORE 58 /* Motorola Start*Core processor */ -+#define EM_ME16 59 /* Toyota ME16 processor */ -+#define EM_ST100 60 /* STMicroelectronic ST100 processor */ -+#define EM_TINYJ 61 /* Advanced Logic Corp. 
Tinyj emb.fam*/ -+#define EM_X86_64 62 /* AMD x86-64 architecture */ -+#define EM_PDSP 63 /* Sony DSP Processor */ -+ -+#define EM_FX66 66 /* Siemens FX66 microcontroller */ -+#define EM_ST9PLUS 67 /* STMicroelectronics ST9+ 8/16 mc */ -+#define EM_ST7 68 /* STmicroelectronics ST7 8 bit mc */ -+#define EM_68HC16 69 /* Motorola MC68HC16 microcontroller */ -+#define EM_68HC11 70 /* Motorola MC68HC11 microcontroller */ -+#define EM_68HC08 71 /* Motorola MC68HC08 microcontroller */ -+#define EM_68HC05 72 /* Motorola MC68HC05 microcontroller */ -+#define EM_SVX 73 /* Silicon Graphics SVx */ -+#define EM_ST19 74 /* STMicroelectronics ST19 8 bit mc */ -+#define EM_VAX 75 /* Digital VAX */ -+#define EM_CRIS 76 /* Axis Communications 32-bit embedded processor */ -+#define EM_JAVELIN 77 /* Infineon Technologies 32-bit embedded processor */ -+#define EM_FIREPATH 78 /* Element 14 64-bit DSP Processor */ -+#define EM_ZSP 79 /* LSI Logic 16-bit DSP Processor */ -+#define EM_MMIX 80 /* Donald Knuth's educational 64-bit processor */ -+#define EM_HUANY 81 /* Harvard University machine-independent object files */ -+#define EM_PRISM 82 /* SiTera Prism */ -+#define EM_AVR 83 /* Atmel AVR 8-bit microcontroller */ -+#define EM_FR30 84 /* Fujitsu FR30 */ -+#define EM_D10V 85 /* Mitsubishi D10V */ -+#define EM_D30V 86 /* Mitsubishi D30V */ -+#define EM_V850 87 /* NEC v850 */ -+#define EM_M32R 88 /* Mitsubishi M32R */ -+#define EM_MN10300 89 /* Matsushita MN10300 */ -+#define EM_MN10200 90 /* Matsushita MN10200 */ -+#define EM_PJ 91 /* picoJava */ -+#define EM_OPENRISC 92 /* OpenRISC 32-bit embedded processor */ -+#define EM_ARC_A5 93 /* ARC Cores Tangent-A5 */ -+#define EM_XTENSA 94 /* Tensilica Xtensa Architecture */ -+#define EM_TILEPRO 188 /* Tilera TILEPro */ -+#define EM_TILEGX 191 /* Tilera TILE-Gx */ -+#define EM_NUM 192 -+ -+/* If it is necessary to assign new unofficial EM_* values, please -+ pick large random numbers (0x8523, 0xa7f2, etc.) to minimize the -+ chances of collision with official or non-GNU unofficial values. */ -+ -+#define EM_ALPHA 0x9026 -+ -+/* Legal values for e_version (version). */ -+ -+#define EV_NONE 0 /* Invalid ELF version */ -+#define EV_CURRENT 1 /* Current version */ -+#define EV_NUM 2 -+ -+/* Section header. */ -+ -+typedef struct -+{ -+ Elf32_Word sh_name; /* Section name (string tbl index) */ -+ Elf32_Word sh_type; /* Section type */ -+ Elf32_Word sh_flags; /* Section flags */ -+ Elf32_Addr sh_addr; /* Section virtual addr at execution */ -+ Elf32_Off sh_offset; /* Section file offset */ -+ Elf32_Word sh_size; /* Section size in bytes */ -+ Elf32_Word sh_link; /* Link to another section */ -+ Elf32_Word sh_info; /* Additional section information */ -+ Elf32_Word sh_addralign; /* Section alignment */ -+ Elf32_Word sh_entsize; /* Entry size if section holds table */ -+} Elf32_Shdr; -+ -+typedef struct -+{ -+ Elf64_Word sh_name; /* Section name (string tbl index) */ -+ Elf64_Word sh_type; /* Section type */ -+ Elf64_Xword sh_flags; /* Section flags */ -+ Elf64_Addr sh_addr; /* Section virtual addr at execution */ -+ Elf64_Off sh_offset; /* Section file offset */ -+ Elf64_Xword sh_size; /* Section size in bytes */ -+ Elf64_Word sh_link; /* Link to another section */ -+ Elf64_Word sh_info; /* Additional section information */ -+ Elf64_Xword sh_addralign; /* Section alignment */ -+ Elf64_Xword sh_entsize; /* Entry size if section holds table */ -+} Elf64_Shdr; -+ -+/* Special section indices. 
*/ -+ -+#define SHN_UNDEF 0 /* Undefined section */ -+#define SHN_LORESERVE 0xff00 /* Start of reserved indices */ -+#define SHN_LOPROC 0xff00 /* Start of processor-specific */ -+#define SHN_BEFORE 0xff00 /* Order section before all others -+ (Solaris). */ -+#define SHN_AFTER 0xff01 /* Order section after all others -+ (Solaris). */ -+#define SHN_HIPROC 0xff1f /* End of processor-specific */ -+#define SHN_LOOS 0xff20 /* Start of OS-specific */ -+#define SHN_HIOS 0xff3f /* End of OS-specific */ -+#define SHN_ABS 0xfff1 /* Associated symbol is absolute */ -+#define SHN_COMMON 0xfff2 /* Associated symbol is common */ -+#define SHN_XINDEX 0xffff /* Index is in extra table. */ -+#define SHN_HIRESERVE 0xffff /* End of reserved indices */ -+ -+/* Legal values for sh_type (section type). */ -+ -+#define SHT_NULL 0 /* Section header table entry unused */ -+#define SHT_PROGBITS 1 /* Program data */ -+#define SHT_SYMTAB 2 /* Symbol table */ -+#define SHT_STRTAB 3 /* String table */ -+#define SHT_RELA 4 /* Relocation entries with addends */ -+#define SHT_HASH 5 /* Symbol hash table */ -+#define SHT_DYNAMIC 6 /* Dynamic linking information */ -+#define SHT_NOTE 7 /* Notes */ -+#define SHT_NOBITS 8 /* Program space with no data (bss) */ -+#define SHT_REL 9 /* Relocation entries, no addends */ -+#define SHT_SHLIB 10 /* Reserved */ -+#define SHT_DYNSYM 11 /* Dynamic linker symbol table */ -+#define SHT_INIT_ARRAY 14 /* Array of constructors */ -+#define SHT_FINI_ARRAY 15 /* Array of destructors */ -+#define SHT_PREINIT_ARRAY 16 /* Array of pre-constructors */ -+#define SHT_GROUP 17 /* Section group */ -+#define SHT_SYMTAB_SHNDX 18 /* Extended section indeces */ -+#define SHT_NUM 19 /* Number of defined types. */ -+#define SHT_LOOS 0x60000000 /* Start OS-specific. */ -+#define SHT_GNU_ATTRIBUTES 0x6ffffff5 /* Object attributes. */ -+#define SHT_GNU_HASH 0x6ffffff6 /* GNU-style hash table. */ -+#define SHT_GNU_LIBLIST 0x6ffffff7 /* Prelink library list */ -+#define SHT_CHECKSUM 0x6ffffff8 /* Checksum for DSO content. */ -+#define SHT_LOSUNW 0x6ffffffa /* Sun-specific low bound. */ -+#define SHT_SUNW_move 0x6ffffffa -+#define SHT_SUNW_COMDAT 0x6ffffffb -+#define SHT_SUNW_syminfo 0x6ffffffc -+#define SHT_GNU_verdef 0x6ffffffd /* Version definition section. */ -+#define SHT_GNU_verneed 0x6ffffffe /* Version needs section. */ -+#define SHT_GNU_versym 0x6fffffff /* Version symbol table. */ -+#define SHT_HISUNW 0x6fffffff /* Sun-specific high bound. */ -+#define SHT_HIOS 0x6fffffff /* End OS-specific type */ -+#define SHT_LOPROC 0x70000000 /* Start of processor-specific */ -+#define SHT_HIPROC 0x7fffffff /* End of processor-specific */ -+#define SHT_LOUSER 0x80000000 /* Start of application-specific */ -+#define SHT_HIUSER 0x8fffffff /* End of application-specific */ -+ -+/* Legal values for sh_flags (section flags). */ -+ -+#define SHF_WRITE (1 << 0) /* Writable */ -+#define SHF_ALLOC (1 << 1) /* Occupies memory during execution */ -+#define SHF_EXECINSTR (1 << 2) /* Executable */ -+#define SHF_MERGE (1 << 4) /* Might be merged */ -+#define SHF_STRINGS (1 << 5) /* Contains nul-terminated strings */ -+#define SHF_INFO_LINK (1 << 6) /* `sh_info' contains SHT index */ -+#define SHF_LINK_ORDER (1 << 7) /* Preserve order after combining */ -+#define SHF_OS_NONCONFORMING (1 << 8) /* Non-standard OS specific handling -+ required */ -+#define SHF_GROUP (1 << 9) /* Section is member of a group. */ -+#define SHF_TLS (1 << 10) /* Section hold thread-local data. */ -+#define SHF_MASKOS 0x0ff00000 /* OS-specific. 
*/ -+#define SHF_MASKPROC 0xf0000000 /* Processor-specific */ -+#define SHF_ORDERED (1 << 30) /* Special ordering requirement -+ (Solaris). */ -+#define SHF_EXCLUDE (1 << 31) /* Section is excluded unless -+ referenced or allocated (Solaris).*/ -+ -+/* Section group handling. */ -+#define GRP_COMDAT 0x1 /* Mark group as COMDAT. */ -+ -+/* Symbol table entry. */ -+ -+typedef struct -+{ -+ Elf32_Word st_name; /* Symbol name (string tbl index) */ -+ Elf32_Addr st_value; /* Symbol value */ -+ Elf32_Word st_size; /* Symbol size */ -+ unsigned char st_info; /* Symbol type and binding */ -+ unsigned char st_other; /* Symbol visibility */ -+ Elf32_Section st_shndx; /* Section index */ -+} Elf32_Sym; -+ -+typedef struct -+{ -+ Elf64_Word st_name; /* Symbol name (string tbl index) */ -+ unsigned char st_info; /* Symbol type and binding */ -+ unsigned char st_other; /* Symbol visibility */ -+ Elf64_Section st_shndx; /* Section index */ -+ Elf64_Addr st_value; /* Symbol value */ -+ Elf64_Xword st_size; /* Symbol size */ -+} Elf64_Sym; -+ -+/* The syminfo section if available contains additional information about -+ every dynamic symbol. */ -+ -+typedef struct -+{ -+ Elf32_Half si_boundto; /* Direct bindings, symbol bound to */ -+ Elf32_Half si_flags; /* Per symbol flags */ -+} Elf32_Syminfo; -+ -+typedef struct -+{ -+ Elf64_Half si_boundto; /* Direct bindings, symbol bound to */ -+ Elf64_Half si_flags; /* Per symbol flags */ -+} Elf64_Syminfo; -+ -+/* Possible values for si_boundto. */ -+#define SYMINFO_BT_SELF 0xffff /* Symbol bound to self */ -+#define SYMINFO_BT_PARENT 0xfffe /* Symbol bound to parent */ -+#define SYMINFO_BT_LOWRESERVE 0xff00 /* Beginning of reserved entries */ -+ -+/* Possible bitmasks for si_flags. */ -+#define SYMINFO_FLG_DIRECT 0x0001 /* Direct bound symbol */ -+#define SYMINFO_FLG_PASSTHRU 0x0002 /* Pass-thru symbol for translator */ -+#define SYMINFO_FLG_COPY 0x0004 /* Symbol is a copy-reloc */ -+#define SYMINFO_FLG_LAZYLOAD 0x0008 /* Symbol bound to object to be lazy -+ loaded */ -+/* Syminfo version values. */ -+#define SYMINFO_NONE 0 -+#define SYMINFO_CURRENT 1 -+#define SYMINFO_NUM 2 -+ -+ -+/* How to extract and insert information held in the st_info field. */ -+ -+#define ELF32_ST_BIND(val) (((unsigned char) (val)) >> 4) -+#define ELF32_ST_TYPE(val) ((val) & 0xf) -+#define ELF32_ST_INFO(bind, type) (((bind) << 4) + ((type) & 0xf)) -+ -+/* Both Elf32_Sym and Elf64_Sym use the same one-byte st_info field. */ -+#define ELF64_ST_BIND(val) ELF32_ST_BIND (val) -+#define ELF64_ST_TYPE(val) ELF32_ST_TYPE (val) -+#define ELF64_ST_INFO(bind, type) ELF32_ST_INFO ((bind), (type)) -+ -+/* Legal values for ST_BIND subfield of st_info (symbol binding). */ -+ -+#define STB_LOCAL 0 /* Local symbol */ -+#define STB_GLOBAL 1 /* Global symbol */ -+#define STB_WEAK 2 /* Weak symbol */ -+#define STB_NUM 3 /* Number of defined types. */ -+#define STB_LOOS 10 /* Start of OS-specific */ -+#define STB_GNU_UNIQUE 10 /* Unique symbol. */ -+#define STB_HIOS 12 /* End of OS-specific */ -+#define STB_LOPROC 13 /* Start of processor-specific */ -+#define STB_HIPROC 15 /* End of processor-specific */ -+ -+/* Legal values for ST_TYPE subfield of st_info (symbol type). 
*/ -+ -+#define STT_NOTYPE 0 /* Symbol type is unspecified */ -+#define STT_OBJECT 1 /* Symbol is a data object */ -+#define STT_FUNC 2 /* Symbol is a code object */ -+#define STT_SECTION 3 /* Symbol associated with a section */ -+#define STT_FILE 4 /* Symbol's name is file name */ -+#define STT_COMMON 5 /* Symbol is a common data object */ -+#define STT_TLS 6 /* Symbol is thread-local data object*/ -+#define STT_NUM 7 /* Number of defined types. */ -+#define STT_LOOS 10 /* Start of OS-specific */ -+#define STT_GNU_IFUNC 10 /* Symbol is indirect code object */ -+#define STT_HIOS 12 /* End of OS-specific */ -+#define STT_LOPROC 13 /* Start of processor-specific */ -+#define STT_HIPROC 15 /* End of processor-specific */ -+ -+ -+/* Symbol table indices are found in the hash buckets and chain table -+ of a symbol hash table section. This special index value indicates -+ the end of a chain, meaning no further symbols are found in that bucket. */ -+ -+#define STN_UNDEF 0 /* End of a chain. */ -+ -+ -+/* How to extract and insert information held in the st_other field. */ -+ -+#define ELF32_ST_VISIBILITY(o) ((o) & 0x03) -+ -+/* For ELF64 the definitions are the same. */ -+#define ELF64_ST_VISIBILITY(o) ELF32_ST_VISIBILITY (o) -+ -+/* Symbol visibility specification encoded in the st_other field. */ -+#define STV_DEFAULT 0 /* Default symbol visibility rules */ -+#define STV_INTERNAL 1 /* Processor specific hidden class */ -+#define STV_HIDDEN 2 /* Sym unavailable in other modules */ -+#define STV_PROTECTED 3 /* Not preemptible, not exported */ -+ -+ -+/* Relocation table entry without addend (in section of type SHT_REL). */ -+ -+typedef struct -+{ -+ Elf32_Addr r_offset; /* Address */ -+ Elf32_Word r_info; /* Relocation type and symbol index */ -+} Elf32_Rel; -+ -+/* I have seen two different definitions of the Elf64_Rel and -+ Elf64_Rela structures, so we'll leave them out until Novell (or -+ whoever) gets their act together. */ -+/* The following, at least, is used on Sparc v9, MIPS, and Alpha. */ -+ -+typedef struct -+{ -+ Elf64_Addr r_offset; /* Address */ -+ Elf64_Xword r_info; /* Relocation type and symbol index */ -+} Elf64_Rel; -+ -+/* Relocation table entry with addend (in section of type SHT_RELA). */ -+ -+typedef struct -+{ -+ Elf32_Addr r_offset; /* Address */ -+ Elf32_Word r_info; /* Relocation type and symbol index */ -+ Elf32_Sword r_addend; /* Addend */ -+} Elf32_Rela; -+ -+typedef struct -+{ -+ Elf64_Addr r_offset; /* Address */ -+ Elf64_Xword r_info; /* Relocation type and symbol index */ -+ Elf64_Sxword r_addend; /* Addend */ -+} Elf64_Rela; -+ -+/* How to extract and insert information held in the r_info field. */ -+ -+#define ELF32_R_SYM(val) ((val) >> 8) -+#define ELF32_R_TYPE(val) ((val) & 0xff) -+#define ELF32_R_INFO(sym, type) (((sym) << 8) + ((type) & 0xff)) -+ -+#define ELF64_R_SYM(i) ((i) >> 32) -+#define ELF64_R_TYPE(i) ((i) & 0xffffffff) -+#define ELF64_R_INFO(sym,type) ((((Elf64_Xword) (sym)) << 32) + (type)) -+ -+/* Program segment header. 
*/ -+ -+typedef struct -+{ -+ Elf32_Word p_type; /* Segment type */ -+ Elf32_Off p_offset; /* Segment file offset */ -+ Elf32_Addr p_vaddr; /* Segment virtual address */ -+ Elf32_Addr p_paddr; /* Segment physical address */ -+ Elf32_Word p_filesz; /* Segment size in file */ -+ Elf32_Word p_memsz; /* Segment size in memory */ -+ Elf32_Word p_flags; /* Segment flags */ -+ Elf32_Word p_align; /* Segment alignment */ -+} Elf32_Phdr; -+ -+typedef struct -+{ -+ Elf64_Word p_type; /* Segment type */ -+ Elf64_Word p_flags; /* Segment flags */ -+ Elf64_Off p_offset; /* Segment file offset */ -+ Elf64_Addr p_vaddr; /* Segment virtual address */ -+ Elf64_Addr p_paddr; /* Segment physical address */ -+ Elf64_Xword p_filesz; /* Segment size in file */ -+ Elf64_Xword p_memsz; /* Segment size in memory */ -+ Elf64_Xword p_align; /* Segment alignment */ -+} Elf64_Phdr; -+ -+/* Special value for e_phnum. This indicates that the real number of -+ program headers is too large to fit into e_phnum. Instead the real -+ value is in the field sh_info of section 0. */ -+ -+#define PN_XNUM 0xffff -+ -+/* Legal values for p_type (segment type). */ -+ -+#define PT_NULL 0 /* Program header table entry unused */ -+#define PT_LOAD 1 /* Loadable program segment */ -+#define PT_DYNAMIC 2 /* Dynamic linking information */ -+#define PT_INTERP 3 /* Program interpreter */ -+#define PT_NOTE 4 /* Auxiliary information */ -+#define PT_SHLIB 5 /* Reserved */ -+#define PT_PHDR 6 /* Entry for header table itself */ -+#define PT_TLS 7 /* Thread-local storage segment */ -+#define PT_NUM 8 /* Number of defined types */ -+#define PT_LOOS 0x60000000 /* Start of OS-specific */ -+#define PT_GNU_EH_FRAME 0x6474e550 /* GCC .eh_frame_hdr segment */ -+#define PT_GNU_STACK 0x6474e551 /* Indicates stack executability */ -+#define PT_GNU_RELRO 0x6474e552 /* Read-only after relocation */ -+#define PT_LOSUNW 0x6ffffffa -+#define PT_SUNWBSS 0x6ffffffa /* Sun Specific segment */ -+#define PT_SUNWSTACK 0x6ffffffb /* Stack segment */ -+#define PT_HISUNW 0x6fffffff -+#define PT_HIOS 0x6fffffff /* End of OS-specific */ -+#define PT_LOPROC 0x70000000 /* Start of processor-specific */ -+#define PT_HIPROC 0x7fffffff /* End of processor-specific */ -+ -+/* Legal values for p_flags (segment flags). */ -+ -+#define PF_X (1 << 0) /* Segment is executable */ -+#define PF_W (1 << 1) /* Segment is writable */ -+#define PF_R (1 << 2) /* Segment is readable */ -+#define PF_MASKOS 0x0ff00000 /* OS-specific */ -+#define PF_MASKPROC 0xf0000000 /* Processor-specific */ -+ -+/* Legal values for note segment descriptor types for core files. 
*/ -+ -+#define NT_PRSTATUS 1 /* Contains copy of prstatus struct */ -+#define NT_FPREGSET 2 /* Contains copy of fpregset struct */ -+#define NT_PRPSINFO 3 /* Contains copy of prpsinfo struct */ -+#define NT_PRXREG 4 /* Contains copy of prxregset struct */ -+#define NT_TASKSTRUCT 4 /* Contains copy of task structure */ -+#define NT_PLATFORM 5 /* String from sysinfo(SI_PLATFORM) */ -+#define NT_AUXV 6 /* Contains copy of auxv array */ -+#define NT_GWINDOWS 7 /* Contains copy of gwindows struct */ -+#define NT_ASRS 8 /* Contains copy of asrset struct */ -+#define NT_PSTATUS 10 /* Contains copy of pstatus struct */ -+#define NT_PSINFO 13 /* Contains copy of psinfo struct */ -+#define NT_PRCRED 14 /* Contains copy of prcred struct */ -+#define NT_UTSNAME 15 /* Contains copy of utsname struct */ -+#define NT_LWPSTATUS 16 /* Contains copy of lwpstatus struct */ -+#define NT_LWPSINFO 17 /* Contains copy of lwpinfo struct */ -+#define NT_PRFPXREG 20 /* Contains copy of fprxregset struct */ -+#define NT_PRXFPREG 0x46e62b7f /* Contains copy of user_fxsr_struct */ -+#define NT_PPC_VMX 0x100 /* PowerPC Altivec/VMX registers */ -+#define NT_PPC_SPE 0x101 /* PowerPC SPE/EVR registers */ -+#define NT_PPC_VSX 0x102 /* PowerPC VSX registers */ -+#define NT_386_TLS 0x200 /* i386 TLS slots (struct user_desc) */ -+#define NT_386_IOPERM 0x201 /* x86 io permission bitmap (1=deny) */ -+#define NT_X86_XSTATE 0x202 /* x86 extended state using xsave */ -+ -+/* Legal values for the note segment descriptor types for object files. */ -+ -+#define NT_VERSION 1 /* Contains a version string. */ -+ -+ -+/* Dynamic section entry. */ -+ -+typedef struct -+{ -+ Elf32_Sword d_tag; /* Dynamic entry type */ -+ union -+ { -+ Elf32_Word d_val; /* Integer value */ -+ Elf32_Addr d_ptr; /* Address value */ -+ } d_un; -+} Elf32_Dyn; -+ -+typedef struct -+{ -+ Elf64_Sxword d_tag; /* Dynamic entry type */ -+ union -+ { -+ Elf64_Xword d_val; /* Integer value */ -+ Elf64_Addr d_ptr; /* Address value */ -+ } d_un; -+} Elf64_Dyn; -+ -+/* Legal values for d_tag (dynamic entry type). 
*/ -+ -+#define DT_NULL 0 /* Marks end of dynamic section */ -+#define DT_NEEDED 1 /* Name of needed library */ -+#define DT_PLTRELSZ 2 /* Size in bytes of PLT relocs */ -+#define DT_PLTGOT 3 /* Processor defined value */ -+#define DT_HASH 4 /* Address of symbol hash table */ -+#define DT_STRTAB 5 /* Address of string table */ -+#define DT_SYMTAB 6 /* Address of symbol table */ -+#define DT_RELA 7 /* Address of Rela relocs */ -+#define DT_RELASZ 8 /* Total size of Rela relocs */ -+#define DT_RELAENT 9 /* Size of one Rela reloc */ -+#define DT_STRSZ 10 /* Size of string table */ -+#define DT_SYMENT 11 /* Size of one symbol table entry */ -+#define DT_INIT 12 /* Address of init function */ -+#define DT_FINI 13 /* Address of termination function */ -+#define DT_SONAME 14 /* Name of shared object */ -+#define DT_RPATH 15 /* Library search path (deprecated) */ -+#define DT_SYMBOLIC 16 /* Start symbol search here */ -+#define DT_REL 17 /* Address of Rel relocs */ -+#define DT_RELSZ 18 /* Total size of Rel relocs */ -+#define DT_RELENT 19 /* Size of one Rel reloc */ -+#define DT_PLTREL 20 /* Type of reloc in PLT */ -+#define DT_DEBUG 21 /* For debugging; unspecified */ -+#define DT_TEXTREL 22 /* Reloc might modify .text */ -+#define DT_JMPREL 23 /* Address of PLT relocs */ -+#define DT_BIND_NOW 24 /* Process relocations of object */ -+#define DT_INIT_ARRAY 25 /* Array with addresses of init fct */ -+#define DT_FINI_ARRAY 26 /* Array with addresses of fini fct */ -+#define DT_INIT_ARRAYSZ 27 /* Size in bytes of DT_INIT_ARRAY */ -+#define DT_FINI_ARRAYSZ 28 /* Size in bytes of DT_FINI_ARRAY */ -+#define DT_RUNPATH 29 /* Library search path */ -+#define DT_FLAGS 30 /* Flags for the object being loaded */ -+#define DT_ENCODING 32 /* Start of encoded range */ -+#define DT_PREINIT_ARRAY 32 /* Array with addresses of preinit fct*/ -+#define DT_PREINIT_ARRAYSZ 33 /* size in bytes of DT_PREINIT_ARRAY */ -+#define DT_NUM 34 /* Number used */ -+#define DT_LOOS 0x6000000d /* Start of OS-specific */ -+#define DT_HIOS 0x6ffff000 /* End of OS-specific */ -+#define DT_LOPROC 0x70000000 /* Start of processor-specific */ -+#define DT_HIPROC 0x7fffffff /* End of processor-specific */ -+#define DT_PROCNUM DT_MIPS_NUM /* Most used by any processor */ -+ -+/* DT_* entries which fall between DT_VALRNGHI & DT_VALRNGLO use the -+ Dyn.d_un.d_val field of the Elf*_Dyn structure. This follows Sun's -+ approach. */ -+#define DT_VALRNGLO 0x6ffffd00 -+#define DT_GNU_PRELINKED 0x6ffffdf5 /* Prelinking timestamp */ -+#define DT_GNU_CONFLICTSZ 0x6ffffdf6 /* Size of conflict section */ -+#define DT_GNU_LIBLISTSZ 0x6ffffdf7 /* Size of library list */ -+#define DT_CHECKSUM 0x6ffffdf8 -+#define DT_PLTPADSZ 0x6ffffdf9 -+#define DT_MOVEENT 0x6ffffdfa -+#define DT_MOVESZ 0x6ffffdfb -+#define DT_FEATURE_1 0x6ffffdfc /* Feature selection (DTF_*). */ -+#define DT_POSFLAG_1 0x6ffffdfd /* Flags for DT_* entries, effecting -+ the following DT_* entry. */ -+#define DT_SYMINSZ 0x6ffffdfe /* Size of syminfo table (in bytes) */ -+#define DT_SYMINENT 0x6ffffdff /* Entry size of syminfo */ -+#define DT_VALRNGHI 0x6ffffdff -+#define DT_VALTAGIDX(tag) (DT_VALRNGHI - (tag)) /* Reverse order! */ -+#define DT_VALNUM 12 -+ -+/* DT_* entries which fall between DT_ADDRRNGHI & DT_ADDRRNGLO use the -+ Dyn.d_un.d_ptr field of the Elf*_Dyn structure. -+ -+ If any adjustment is made to the ELF object after it has been -+ built these entries will need to be adjusted. */ -+#define DT_ADDRRNGLO 0x6ffffe00 -+#define DT_GNU_HASH 0x6ffffef5 /* GNU-style hash table. 
*/ -+#define DT_TLSDESC_PLT 0x6ffffef6 -+#define DT_TLSDESC_GOT 0x6ffffef7 -+#define DT_GNU_CONFLICT 0x6ffffef8 /* Start of conflict section */ -+#define DT_GNU_LIBLIST 0x6ffffef9 /* Library list */ -+#define DT_CONFIG 0x6ffffefa /* Configuration information. */ -+#define DT_DEPAUDIT 0x6ffffefb /* Dependency auditing. */ -+#define DT_AUDIT 0x6ffffefc /* Object auditing. */ -+#define DT_PLTPAD 0x6ffffefd /* PLT padding. */ -+#define DT_MOVETAB 0x6ffffefe /* Move table. */ -+#define DT_SYMINFO 0x6ffffeff /* Syminfo table. */ -+#define DT_ADDRRNGHI 0x6ffffeff -+#define DT_ADDRTAGIDX(tag) (DT_ADDRRNGHI - (tag)) /* Reverse order! */ -+#define DT_ADDRNUM 11 -+ -+/* The versioning entry types. The next are defined as part of the -+ GNU extension. */ -+#define DT_VERSYM 0x6ffffff0 -+ -+#define DT_RELACOUNT 0x6ffffff9 -+#define DT_RELCOUNT 0x6ffffffa -+ -+/* These were chosen by Sun. */ -+#define DT_FLAGS_1 0x6ffffffb /* State flags, see DF_1_* below. */ -+#define DT_VERDEF 0x6ffffffc /* Address of version definition -+ table */ -+#define DT_VERDEFNUM 0x6ffffffd /* Number of version definitions */ -+#define DT_VERNEED 0x6ffffffe /* Address of table with needed -+ versions */ -+#define DT_VERNEEDNUM 0x6fffffff /* Number of needed versions */ -+#define DT_VERSIONTAGIDX(tag) (DT_VERNEEDNUM - (tag)) /* Reverse order! */ -+#define DT_VERSIONTAGNUM 16 -+ -+/* Sun added these machine-independent extensions in the "processor-specific" -+ range. Be compatible. */ -+#define DT_AUXILIARY 0x7ffffffd /* Shared object to load before self */ -+#define DT_FILTER 0x7fffffff /* Shared object to get values from */ -+#define DT_EXTRATAGIDX(tag) ((Elf32_Word)-((Elf32_Sword) (tag) <<1>>1)-1) -+#define DT_EXTRANUM 3 -+ -+/* Values of `d_un.d_val' in the DT_FLAGS entry. */ -+#define DF_ORIGIN 0x00000001 /* Object may use DF_ORIGIN */ -+#define DF_SYMBOLIC 0x00000002 /* Symbol resolutions starts here */ -+#define DF_TEXTREL 0x00000004 /* Object contains text relocations */ -+#define DF_BIND_NOW 0x00000008 /* No lazy binding for this object */ -+#define DF_STATIC_TLS 0x00000010 /* Module uses the static TLS model */ -+ -+/* State flags selectable in the `d_un.d_val' element of the DT_FLAGS_1 -+ entry in the dynamic section. */ -+#define DF_1_NOW 0x00000001 /* Set RTLD_NOW for this object. */ -+#define DF_1_GLOBAL 0x00000002 /* Set RTLD_GLOBAL for this object. */ -+#define DF_1_GROUP 0x00000004 /* Set RTLD_GROUP for this object. */ -+#define DF_1_NODELETE 0x00000008 /* Set RTLD_NODELETE for this object.*/ -+#define DF_1_LOADFLTR 0x00000010 /* Trigger filtee loading at runtime.*/ -+#define DF_1_INITFIRST 0x00000020 /* Set RTLD_INITFIRST for this object*/ -+#define DF_1_NOOPEN 0x00000040 /* Set RTLD_NOOPEN for this object. */ -+#define DF_1_ORIGIN 0x00000080 /* $ORIGIN must be handled. */ -+#define DF_1_DIRECT 0x00000100 /* Direct binding enabled. */ -+#define DF_1_TRANS 0x00000200 -+#define DF_1_INTERPOSE 0x00000400 /* Object is used to interpose. */ -+#define DF_1_NODEFLIB 0x00000800 /* Ignore default lib search path. */ -+#define DF_1_NODUMP 0x00001000 /* Object can't be dldump'ed. */ -+#define DF_1_CONFALT 0x00002000 /* Configuration alternative created.*/ -+#define DF_1_ENDFILTEE 0x00004000 /* Filtee terminates filters search. */ -+#define DF_1_DISPRELDNE 0x00008000 /* Disp reloc applied at build time. */ -+#define DF_1_DISPRELPND 0x00010000 /* Disp reloc applied at run-time. */ -+ -+/* Flags for the feature selection in DT_FEATURE_1. 
*/ -+#define DTF_1_PARINIT 0x00000001 -+#define DTF_1_CONFEXP 0x00000002 -+ -+/* Flags in the DT_POSFLAG_1 entry effecting only the next DT_* entry. */ -+#define DF_P1_LAZYLOAD 0x00000001 /* Lazyload following object. */ -+#define DF_P1_GROUPPERM 0x00000002 /* Symbols from next object are not -+ generally available. */ -+ -+/* Version definition sections. */ -+ -+typedef struct -+{ -+ Elf32_Half vd_version; /* Version revision */ -+ Elf32_Half vd_flags; /* Version information */ -+ Elf32_Half vd_ndx; /* Version Index */ -+ Elf32_Half vd_cnt; /* Number of associated aux entries */ -+ Elf32_Word vd_hash; /* Version name hash value */ -+ Elf32_Word vd_aux; /* Offset in bytes to verdaux array */ -+ Elf32_Word vd_next; /* Offset in bytes to next verdef -+ entry */ -+} Elf32_Verdef; -+ -+typedef struct -+{ -+ Elf64_Half vd_version; /* Version revision */ -+ Elf64_Half vd_flags; /* Version information */ -+ Elf64_Half vd_ndx; /* Version Index */ -+ Elf64_Half vd_cnt; /* Number of associated aux entries */ -+ Elf64_Word vd_hash; /* Version name hash value */ -+ Elf64_Word vd_aux; /* Offset in bytes to verdaux array */ -+ Elf64_Word vd_next; /* Offset in bytes to next verdef -+ entry */ -+} Elf64_Verdef; -+ -+ -+/* Legal values for vd_version (version revision). */ -+#define VER_DEF_NONE 0 /* No version */ -+#define VER_DEF_CURRENT 1 /* Current version */ -+#define VER_DEF_NUM 2 /* Given version number */ -+ -+/* Legal values for vd_flags (version information flags). */ -+#define VER_FLG_BASE 0x1 /* Version definition of file itself */ -+#define VER_FLG_WEAK 0x2 /* Weak version identifier */ -+ -+/* Versym symbol index values. */ -+#define VER_NDX_LOCAL 0 /* Symbol is local. */ -+#define VER_NDX_GLOBAL 1 /* Symbol is global. */ -+#define VER_NDX_LORESERVE 0xff00 /* Beginning of reserved entries. */ -+#define VER_NDX_ELIMINATE 0xff01 /* Symbol is to be eliminated. */ -+ -+/* Auxialiary version information. */ -+ -+typedef struct -+{ -+ Elf32_Word vda_name; /* Version or dependency names */ -+ Elf32_Word vda_next; /* Offset in bytes to next verdaux -+ entry */ -+} Elf32_Verdaux; -+ -+typedef struct -+{ -+ Elf64_Word vda_name; /* Version or dependency names */ -+ Elf64_Word vda_next; /* Offset in bytes to next verdaux -+ entry */ -+} Elf64_Verdaux; -+ -+ -+/* Version dependency section. */ -+ -+typedef struct -+{ -+ Elf32_Half vn_version; /* Version of structure */ -+ Elf32_Half vn_cnt; /* Number of associated aux entries */ -+ Elf32_Word vn_file; /* Offset of filename for this -+ dependency */ -+ Elf32_Word vn_aux; /* Offset in bytes to vernaux array */ -+ Elf32_Word vn_next; /* Offset in bytes to next verneed -+ entry */ -+} Elf32_Verneed; -+ -+typedef struct -+{ -+ Elf64_Half vn_version; /* Version of structure */ -+ Elf64_Half vn_cnt; /* Number of associated aux entries */ -+ Elf64_Word vn_file; /* Offset of filename for this -+ dependency */ -+ Elf64_Word vn_aux; /* Offset in bytes to vernaux array */ -+ Elf64_Word vn_next; /* Offset in bytes to next verneed -+ entry */ -+} Elf64_Verneed; -+ -+ -+/* Legal values for vn_version (version revision). */ -+#define VER_NEED_NONE 0 /* No version */ -+#define VER_NEED_CURRENT 1 /* Current version */ -+#define VER_NEED_NUM 2 /* Given version number */ -+ -+/* Auxiliary needed version information. 
*/ -+ -+typedef struct -+{ -+ Elf32_Word vna_hash; /* Hash value of dependency name */ -+ Elf32_Half vna_flags; /* Dependency specific information */ -+ Elf32_Half vna_other; /* Unused */ -+ Elf32_Word vna_name; /* Dependency name string offset */ -+ Elf32_Word vna_next; /* Offset in bytes to next vernaux -+ entry */ -+} Elf32_Vernaux; -+ -+typedef struct -+{ -+ Elf64_Word vna_hash; /* Hash value of dependency name */ -+ Elf64_Half vna_flags; /* Dependency specific information */ -+ Elf64_Half vna_other; /* Unused */ -+ Elf64_Word vna_name; /* Dependency name string offset */ -+ Elf64_Word vna_next; /* Offset in bytes to next vernaux -+ entry */ -+} Elf64_Vernaux; -+ -+ -+/* Legal values for vna_flags. */ -+#define VER_FLG_WEAK 0x2 /* Weak version identifier */ -+ -+ -+/* Auxiliary vector. */ -+ -+/* This vector is normally only used by the program interpreter. The -+ usual definition in an ABI supplement uses the name auxv_t. The -+ vector is not usually defined in a standard file, but it -+ can't hurt. We rename it to avoid conflicts. The sizes of these -+ types are an arrangement between the exec server and the program -+ interpreter, so we don't fully specify them here. */ -+ -+typedef struct -+{ -+ uint32_t a_type; /* Entry type */ -+ union -+ { -+ uint32_t a_val; /* Integer value */ -+ /* We use to have pointer elements added here. We cannot do that, -+ though, since it does not work when using 32-bit definitions -+ on 64-bit platforms and vice versa. */ -+ } a_un; -+} Elf32_auxv_t; -+ -+typedef struct -+{ -+ uint64_t a_type; /* Entry type */ -+ union -+ { -+ uint64_t a_val; /* Integer value */ -+ /* We use to have pointer elements added here. We cannot do that, -+ though, since it does not work when using 32-bit definitions -+ on 64-bit platforms and vice versa. */ -+ } a_un; -+} Elf64_auxv_t; -+ -+/* Legal values for a_type (entry type). */ -+ -+#define AT_NULL 0 /* End of vector */ -+#define AT_IGNORE 1 /* Entry should be ignored */ -+#define AT_EXECFD 2 /* File descriptor of program */ -+#define AT_PHDR 3 /* Program headers for program */ -+#define AT_PHENT 4 /* Size of program header entry */ -+#define AT_PHNUM 5 /* Number of program headers */ -+#define AT_PAGESZ 6 /* System page size */ -+#define AT_BASE 7 /* Base address of interpreter */ -+#define AT_FLAGS 8 /* Flags */ -+#define AT_ENTRY 9 /* Entry point of program */ -+#define AT_NOTELF 10 /* Program is not ELF */ -+#define AT_UID 11 /* Real uid */ -+#define AT_EUID 12 /* Effective uid */ -+#define AT_GID 13 /* Real gid */ -+#define AT_EGID 14 /* Effective gid */ -+#define AT_CLKTCK 17 /* Frequency of times() */ -+ -+/* Some more special a_type values describing the hardware. */ -+#define AT_PLATFORM 15 /* String identifying platform. */ -+#define AT_HWCAP 16 /* Machine dependent hints about -+ processor capabilities. */ -+ -+/* This entry gives some information about the FPU initialization -+ performed by the kernel. */ -+#define AT_FPUCW 18 /* Used FPU control word. */ -+ -+/* Cache block sizes. */ -+#define AT_DCACHEBSIZE 19 /* Data cache block size. */ -+#define AT_ICACHEBSIZE 20 /* Instruction cache block size. */ -+#define AT_UCACHEBSIZE 21 /* Unified cache block size. */ -+ -+/* A special ignored value for PPC, used by the kernel to control the -+ interpretation of the AUXV. Must be > 16. */ -+#define AT_IGNOREPPC 22 /* Entry should be ignored. */ -+ -+#define AT_SECURE 23 /* Boolean, was exec setuid-like? 
*/ -+ -+#define AT_BASE_PLATFORM 24 /* String identifying real platforms.*/ -+ -+#define AT_RANDOM 25 /* Address of 16 random bytes. */ -+ -+#define AT_EXECFN 31 /* Filename of executable. */ -+ -+/* Pointer to the global system page used for system calls and other -+ nice things. */ -+#define AT_SYSINFO 32 -+#define AT_SYSINFO_EHDR 33 -+ -+/* Shapes of the caches. Bits 0-3 contains associativity; bits 4-7 contains -+ log2 of line size; mask those to get cache size. */ -+#define AT_L1I_CACHESHAPE 34 -+#define AT_L1D_CACHESHAPE 35 -+#define AT_L2_CACHESHAPE 36 -+#define AT_L3_CACHESHAPE 37 -+ -+/* Note section contents. Each entry in the note section begins with -+ a header of a fixed form. */ -+ -+typedef struct -+{ -+ Elf32_Word n_namesz; /* Length of the note's name. */ -+ Elf32_Word n_descsz; /* Length of the note's descriptor. */ -+ Elf32_Word n_type; /* Type of the note. */ -+} Elf32_Nhdr; -+ -+typedef struct -+{ -+ Elf64_Word n_namesz; /* Length of the note's name. */ -+ Elf64_Word n_descsz; /* Length of the note's descriptor. */ -+ Elf64_Word n_type; /* Type of the note. */ -+} Elf64_Nhdr; -+ -+/* Known names of notes. */ -+ -+/* Solaris entries in the note section have this name. */ -+#define ELF_NOTE_SOLARIS "SUNW Solaris" -+ -+/* Note entries for GNU systems have this name. */ -+#define ELF_NOTE_GNU "GNU" -+ -+ -+/* Defined types of notes for Solaris. */ -+ -+/* Value of descriptor (one word) is desired pagesize for the binary. */ -+#define ELF_NOTE_PAGESIZE_HINT 1 -+ -+ -+/* Defined note types for GNU systems. */ -+ -+/* ABI information. The descriptor consists of words: -+ word 0: OS descriptor -+ word 1: major version of the ABI -+ word 2: minor version of the ABI -+ word 3: subminor version of the ABI -+*/ -+#define NT_GNU_ABI_TAG 1 -+#define ELF_NOTE_ABI NT_GNU_ABI_TAG /* Old name. */ -+ -+/* Known OSes. These values can appear in word 0 of an -+ NT_GNU_ABI_TAG note section entry. */ -+#define ELF_NOTE_OS_LINUX 0 -+#define ELF_NOTE_OS_GNU 1 -+#define ELF_NOTE_OS_SOLARIS2 2 -+#define ELF_NOTE_OS_FREEBSD 3 -+ -+/* Synthetic hwcap information. The descriptor begins with two words: -+ word 0: number of entries -+ word 1: bitmask of enabled entries -+ Then follow variable-length entries, one byte followed by a -+ '\0'-terminated hwcap name string. The byte gives the bit -+ number to test if enabled, (1U << bit) & bitmask. */ -+#define NT_GNU_HWCAP 2 -+ -+/* Build ID bits as generated by ld --build-id. -+ The descriptor consists of any nonzero number of bytes. */ -+#define NT_GNU_BUILD_ID 3 -+ -+/* Version note generated by GNU gold containing a version string. */ -+#define NT_GNU_GOLD_VERSION 4 -+ -+ -+/* Move records. */ -+typedef struct -+{ -+ Elf32_Xword m_value; /* Symbol value. */ -+ Elf32_Word m_info; /* Size and index. */ -+ Elf32_Word m_poffset; /* Symbol offset. */ -+ Elf32_Half m_repeat; /* Repeat count. */ -+ Elf32_Half m_stride; /* Stride info. */ -+} Elf32_Move; -+ -+typedef struct -+{ -+ Elf64_Xword m_value; /* Symbol value. */ -+ Elf64_Xword m_info; /* Size and index. */ -+ Elf64_Xword m_poffset; /* Symbol offset. */ -+ Elf64_Half m_repeat; /* Repeat count. */ -+ Elf64_Half m_stride; /* Stride info. */ -+} Elf64_Move; -+ -+/* Macro to construct move records. 
*/ -+#define ELF32_M_SYM(info) ((info) >> 8) -+#define ELF32_M_SIZE(info) ((unsigned char) (info)) -+#define ELF32_M_INFO(sym, size) (((sym) << 8) + (unsigned char) (size)) -+ -+#define ELF64_M_SYM(info) ELF32_M_SYM (info) -+#define ELF64_M_SIZE(info) ELF32_M_SIZE (info) -+#define ELF64_M_INFO(sym, size) ELF32_M_INFO (sym, size) -+ -+ -+/* Motorola 68k specific definitions. */ -+ -+/* Values for Elf32_Ehdr.e_flags. */ -+#define EF_CPU32 0x00810000 -+ -+/* m68k relocs. */ -+ -+#define R_68K_NONE 0 /* No reloc */ -+#define R_68K_32 1 /* Direct 32 bit */ -+#define R_68K_16 2 /* Direct 16 bit */ -+#define R_68K_8 3 /* Direct 8 bit */ -+#define R_68K_PC32 4 /* PC relative 32 bit */ -+#define R_68K_PC16 5 /* PC relative 16 bit */ -+#define R_68K_PC8 6 /* PC relative 8 bit */ -+#define R_68K_GOT32 7 /* 32 bit PC relative GOT entry */ -+#define R_68K_GOT16 8 /* 16 bit PC relative GOT entry */ -+#define R_68K_GOT8 9 /* 8 bit PC relative GOT entry */ -+#define R_68K_GOT32O 10 /* 32 bit GOT offset */ -+#define R_68K_GOT16O 11 /* 16 bit GOT offset */ -+#define R_68K_GOT8O 12 /* 8 bit GOT offset */ -+#define R_68K_PLT32 13 /* 32 bit PC relative PLT address */ -+#define R_68K_PLT16 14 /* 16 bit PC relative PLT address */ -+#define R_68K_PLT8 15 /* 8 bit PC relative PLT address */ -+#define R_68K_PLT32O 16 /* 32 bit PLT offset */ -+#define R_68K_PLT16O 17 /* 16 bit PLT offset */ -+#define R_68K_PLT8O 18 /* 8 bit PLT offset */ -+#define R_68K_COPY 19 /* Copy symbol at runtime */ -+#define R_68K_GLOB_DAT 20 /* Create GOT entry */ -+#define R_68K_JMP_SLOT 21 /* Create PLT entry */ -+#define R_68K_RELATIVE 22 /* Adjust by program base */ -+#define R_68K_TLS_GD32 25 /* 32 bit GOT offset for GD */ -+#define R_68K_TLS_GD16 26 /* 16 bit GOT offset for GD */ -+#define R_68K_TLS_GD8 27 /* 8 bit GOT offset for GD */ -+#define R_68K_TLS_LDM32 28 /* 32 bit GOT offset for LDM */ -+#define R_68K_TLS_LDM16 29 /* 16 bit GOT offset for LDM */ -+#define R_68K_TLS_LDM8 30 /* 8 bit GOT offset for LDM */ -+#define R_68K_TLS_LDO32 31 /* 32 bit module-relative offset */ -+#define R_68K_TLS_LDO16 32 /* 16 bit module-relative offset */ -+#define R_68K_TLS_LDO8 33 /* 8 bit module-relative offset */ -+#define R_68K_TLS_IE32 34 /* 32 bit GOT offset for IE */ -+#define R_68K_TLS_IE16 35 /* 16 bit GOT offset for IE */ -+#define R_68K_TLS_IE8 36 /* 8 bit GOT offset for IE */ -+#define R_68K_TLS_LE32 37 /* 32 bit offset relative to -+ static TLS block */ -+#define R_68K_TLS_LE16 38 /* 16 bit offset relative to -+ static TLS block */ -+#define R_68K_TLS_LE8 39 /* 8 bit offset relative to -+ static TLS block */ -+#define R_68K_TLS_DTPMOD32 40 /* 32 bit module number */ -+#define R_68K_TLS_DTPREL32 41 /* 32 bit module-relative offset */ -+#define R_68K_TLS_TPREL32 42 /* 32 bit TP-relative offset */ -+/* Keep this the last entry. */ -+#define R_68K_NUM 43 -+ -+/* Intel 80386 specific definitions. */ -+ -+/* i386 relocs. 
*/ -+ -+#define R_386_NONE 0 /* No reloc */ -+#define R_386_32 1 /* Direct 32 bit */ -+#define R_386_PC32 2 /* PC relative 32 bit */ -+#define R_386_GOT32 3 /* 32 bit GOT entry */ -+#define R_386_PLT32 4 /* 32 bit PLT address */ -+#define R_386_COPY 5 /* Copy symbol at runtime */ -+#define R_386_GLOB_DAT 6 /* Create GOT entry */ -+#define R_386_JMP_SLOT 7 /* Create PLT entry */ -+#define R_386_RELATIVE 8 /* Adjust by program base */ -+#define R_386_GOTOFF 9 /* 32 bit offset to GOT */ -+#define R_386_GOTPC 10 /* 32 bit PC relative offset to GOT */ -+#define R_386_32PLT 11 -+#define R_386_TLS_TPOFF 14 /* Offset in static TLS block */ -+#define R_386_TLS_IE 15 /* Address of GOT entry for static TLS -+ block offset */ -+#define R_386_TLS_GOTIE 16 /* GOT entry for static TLS block -+ offset */ -+#define R_386_TLS_LE 17 /* Offset relative to static TLS -+ block */ -+#define R_386_TLS_GD 18 /* Direct 32 bit for GNU version of -+ general dynamic thread local data */ -+#define R_386_TLS_LDM 19 /* Direct 32 bit for GNU version of -+ local dynamic thread local data -+ in LE code */ -+#define R_386_16 20 -+#define R_386_PC16 21 -+#define R_386_8 22 -+#define R_386_PC8 23 -+#define R_386_TLS_GD_32 24 /* Direct 32 bit for general dynamic -+ thread local data */ -+#define R_386_TLS_GD_PUSH 25 /* Tag for pushl in GD TLS code */ -+#define R_386_TLS_GD_CALL 26 /* Relocation for call to -+ __tls_get_addr() */ -+#define R_386_TLS_GD_POP 27 /* Tag for popl in GD TLS code */ -+#define R_386_TLS_LDM_32 28 /* Direct 32 bit for local dynamic -+ thread local data in LE code */ -+#define R_386_TLS_LDM_PUSH 29 /* Tag for pushl in LDM TLS code */ -+#define R_386_TLS_LDM_CALL 30 /* Relocation for call to -+ __tls_get_addr() in LDM code */ -+#define R_386_TLS_LDM_POP 31 /* Tag for popl in LDM TLS code */ -+#define R_386_TLS_LDO_32 32 /* Offset relative to TLS block */ -+#define R_386_TLS_IE_32 33 /* GOT entry for negated static TLS -+ block offset */ -+#define R_386_TLS_LE_32 34 /* Negated offset relative to static -+ TLS block */ -+#define R_386_TLS_DTPMOD32 35 /* ID of module containing symbol */ -+#define R_386_TLS_DTPOFF32 36 /* Offset in TLS block */ -+#define R_386_TLS_TPOFF32 37 /* Negated offset in static TLS block */ -+/* 38? */ -+#define R_386_TLS_GOTDESC 39 /* GOT offset for TLS descriptor. */ -+#define R_386_TLS_DESC_CALL 40 /* Marker of call through TLS -+ descriptor for -+ relaxation. */ -+#define R_386_TLS_DESC 41 /* TLS descriptor containing -+ pointer to code and to -+ argument, returning the TLS -+ offset for the symbol. */ -+#define R_386_IRELATIVE 42 /* Adjust indirectly by program base */ -+/* Keep this the last entry. */ -+#define R_386_NUM 43 -+ -+/* SUN SPARC specific definitions. */ -+ -+/* Legal values for ST_TYPE subfield of st_info (symbol type). */ -+ -+#define STT_SPARC_REGISTER 13 /* Global register reserved to app. */ -+ -+/* Values for Elf64_Ehdr.e_flags. */ -+ -+#define EF_SPARCV9_MM 3 -+#define EF_SPARCV9_TSO 0 -+#define EF_SPARCV9_PSO 1 -+#define EF_SPARCV9_RMO 2 -+#define EF_SPARC_LEDATA 0x800000 /* little endian data */ -+#define EF_SPARC_EXT_MASK 0xFFFF00 -+#define EF_SPARC_32PLUS 0x000100 /* generic V8+ features */ -+#define EF_SPARC_SUN_US1 0x000200 /* Sun UltraSPARC1 extensions */ -+#define EF_SPARC_HAL_R1 0x000400 /* HAL R1 extensions */ -+#define EF_SPARC_SUN_US3 0x000800 /* Sun UltraSPARCIII extensions */ -+ -+/* SPARC relocs. 
*/ -+ -+#define R_SPARC_NONE 0 /* No reloc */ -+#define R_SPARC_8 1 /* Direct 8 bit */ -+#define R_SPARC_16 2 /* Direct 16 bit */ -+#define R_SPARC_32 3 /* Direct 32 bit */ -+#define R_SPARC_DISP8 4 /* PC relative 8 bit */ -+#define R_SPARC_DISP16 5 /* PC relative 16 bit */ -+#define R_SPARC_DISP32 6 /* PC relative 32 bit */ -+#define R_SPARC_WDISP30 7 /* PC relative 30 bit shifted */ -+#define R_SPARC_WDISP22 8 /* PC relative 22 bit shifted */ -+#define R_SPARC_HI22 9 /* High 22 bit */ -+#define R_SPARC_22 10 /* Direct 22 bit */ -+#define R_SPARC_13 11 /* Direct 13 bit */ -+#define R_SPARC_LO10 12 /* Truncated 10 bit */ -+#define R_SPARC_GOT10 13 /* Truncated 10 bit GOT entry */ -+#define R_SPARC_GOT13 14 /* 13 bit GOT entry */ -+#define R_SPARC_GOT22 15 /* 22 bit GOT entry shifted */ -+#define R_SPARC_PC10 16 /* PC relative 10 bit truncated */ -+#define R_SPARC_PC22 17 /* PC relative 22 bit shifted */ -+#define R_SPARC_WPLT30 18 /* 30 bit PC relative PLT address */ -+#define R_SPARC_COPY 19 /* Copy symbol at runtime */ -+#define R_SPARC_GLOB_DAT 20 /* Create GOT entry */ -+#define R_SPARC_JMP_SLOT 21 /* Create PLT entry */ -+#define R_SPARC_RELATIVE 22 /* Adjust by program base */ -+#define R_SPARC_UA32 23 /* Direct 32 bit unaligned */ -+ -+/* Additional Sparc64 relocs. */ -+ -+#define R_SPARC_PLT32 24 /* Direct 32 bit ref to PLT entry */ -+#define R_SPARC_HIPLT22 25 /* High 22 bit PLT entry */ -+#define R_SPARC_LOPLT10 26 /* Truncated 10 bit PLT entry */ -+#define R_SPARC_PCPLT32 27 /* PC rel 32 bit ref to PLT entry */ -+#define R_SPARC_PCPLT22 28 /* PC rel high 22 bit PLT entry */ -+#define R_SPARC_PCPLT10 29 /* PC rel trunc 10 bit PLT entry */ -+#define R_SPARC_10 30 /* Direct 10 bit */ -+#define R_SPARC_11 31 /* Direct 11 bit */ -+#define R_SPARC_64 32 /* Direct 64 bit */ -+#define R_SPARC_OLO10 33 /* 10bit with secondary 13bit addend */ -+#define R_SPARC_HH22 34 /* Top 22 bits of direct 64 bit */ -+#define R_SPARC_HM10 35 /* High middle 10 bits of ... */ -+#define R_SPARC_LM22 36 /* Low middle 22 bits of ... */ -+#define R_SPARC_PC_HH22 37 /* Top 22 bits of pc rel 64 bit */ -+#define R_SPARC_PC_HM10 38 /* High middle 10 bit of ... */ -+#define R_SPARC_PC_LM22 39 /* Low miggle 22 bits of ... 
*/ -+#define R_SPARC_WDISP16 40 /* PC relative 16 bit shifted */ -+#define R_SPARC_WDISP19 41 /* PC relative 19 bit shifted */ -+#define R_SPARC_GLOB_JMP 42 /* was part of v9 ABI but was removed */ -+#define R_SPARC_7 43 /* Direct 7 bit */ -+#define R_SPARC_5 44 /* Direct 5 bit */ -+#define R_SPARC_6 45 /* Direct 6 bit */ -+#define R_SPARC_DISP64 46 /* PC relative 64 bit */ -+#define R_SPARC_PLT64 47 /* Direct 64 bit ref to PLT entry */ -+#define R_SPARC_HIX22 48 /* High 22 bit complemented */ -+#define R_SPARC_LOX10 49 /* Truncated 11 bit complemented */ -+#define R_SPARC_H44 50 /* Direct high 12 of 44 bit */ -+#define R_SPARC_M44 51 /* Direct mid 22 of 44 bit */ -+#define R_SPARC_L44 52 /* Direct low 10 of 44 bit */ -+#define R_SPARC_REGISTER 53 /* Global register usage */ -+#define R_SPARC_UA64 54 /* Direct 64 bit unaligned */ -+#define R_SPARC_UA16 55 /* Direct 16 bit unaligned */ -+#define R_SPARC_TLS_GD_HI22 56 -+#define R_SPARC_TLS_GD_LO10 57 -+#define R_SPARC_TLS_GD_ADD 58 -+#define R_SPARC_TLS_GD_CALL 59 -+#define R_SPARC_TLS_LDM_HI22 60 -+#define R_SPARC_TLS_LDM_LO10 61 -+#define R_SPARC_TLS_LDM_ADD 62 -+#define R_SPARC_TLS_LDM_CALL 63 -+#define R_SPARC_TLS_LDO_HIX22 64 -+#define R_SPARC_TLS_LDO_LOX10 65 -+#define R_SPARC_TLS_LDO_ADD 66 -+#define R_SPARC_TLS_IE_HI22 67 -+#define R_SPARC_TLS_IE_LO10 68 -+#define R_SPARC_TLS_IE_LD 69 -+#define R_SPARC_TLS_IE_LDX 70 -+#define R_SPARC_TLS_IE_ADD 71 -+#define R_SPARC_TLS_LE_HIX22 72 -+#define R_SPARC_TLS_LE_LOX10 73 -+#define R_SPARC_TLS_DTPMOD32 74 -+#define R_SPARC_TLS_DTPMOD64 75 -+#define R_SPARC_TLS_DTPOFF32 76 -+#define R_SPARC_TLS_DTPOFF64 77 -+#define R_SPARC_TLS_TPOFF32 78 -+#define R_SPARC_TLS_TPOFF64 79 -+#define R_SPARC_GOTDATA_HIX22 80 -+#define R_SPARC_GOTDATA_LOX10 81 -+#define R_SPARC_GOTDATA_OP_HIX22 82 -+#define R_SPARC_GOTDATA_OP_LOX10 83 -+#define R_SPARC_GOTDATA_OP 84 -+#define R_SPARC_H34 85 -+#define R_SPARC_SIZE32 86 -+#define R_SPARC_SIZE64 87 -+#define R_SPARC_WDISP10 88 -+#define R_SPARC_JMP_IREL 248 -+#define R_SPARC_IRELATIVE 249 -+#define R_SPARC_GNU_VTINHERIT 250 -+#define R_SPARC_GNU_VTENTRY 251 -+#define R_SPARC_REV32 252 -+/* Keep this the last entry. */ -+#define R_SPARC_NUM 253 -+ -+/* For Sparc64, legal values for d_tag of Elf64_Dyn. */ -+ -+#define DT_SPARC_REGISTER 0x70000001 -+#define DT_SPARC_NUM 2 -+ -+/* MIPS R3000 specific definitions. */ -+ -+/* Legal values for e_flags field of Elf32_Ehdr. */ -+ -+#define EF_MIPS_NOREORDER 1 /* A .noreorder directive was used */ -+#define EF_MIPS_PIC 2 /* Contains PIC code */ -+#define EF_MIPS_CPIC 4 /* Uses PIC calling sequence */ -+#define EF_MIPS_XGOT 8 -+#define EF_MIPS_64BIT_WHIRL 16 -+#define EF_MIPS_ABI2 32 -+#define EF_MIPS_ABI_ON32 64 -+#define EF_MIPS_ARCH 0xf0000000 /* MIPS architecture level */ -+ -+/* Legal values for MIPS architecture level. */ -+ -+#define EF_MIPS_ARCH_1 0x00000000 /* -mips1 code. */ -+#define EF_MIPS_ARCH_2 0x10000000 /* -mips2 code. */ -+#define EF_MIPS_ARCH_3 0x20000000 /* -mips3 code. */ -+#define EF_MIPS_ARCH_4 0x30000000 /* -mips4 code. */ -+#define EF_MIPS_ARCH_5 0x40000000 /* -mips5 code. */ -+#define EF_MIPS_ARCH_32 0x60000000 /* MIPS32 code. */ -+#define EF_MIPS_ARCH_64 0x70000000 /* MIPS64 code. */ -+ -+/* The following are non-official names and should not be used. */ -+ -+#define E_MIPS_ARCH_1 0x00000000 /* -mips1 code. */ -+#define E_MIPS_ARCH_2 0x10000000 /* -mips2 code. */ -+#define E_MIPS_ARCH_3 0x20000000 /* -mips3 code. */ -+#define E_MIPS_ARCH_4 0x30000000 /* -mips4 code. 
*/ -+#define E_MIPS_ARCH_5 0x40000000 /* -mips5 code. */ -+#define E_MIPS_ARCH_32 0x60000000 /* MIPS32 code. */ -+#define E_MIPS_ARCH_64 0x70000000 /* MIPS64 code. */ -+ -+/* Special section indices. */ -+ -+#define SHN_MIPS_ACOMMON 0xff00 /* Allocated common symbols */ -+#define SHN_MIPS_TEXT 0xff01 /* Allocated test symbols. */ -+#define SHN_MIPS_DATA 0xff02 /* Allocated data symbols. */ -+#define SHN_MIPS_SCOMMON 0xff03 /* Small common symbols */ -+#define SHN_MIPS_SUNDEFINED 0xff04 /* Small undefined symbols */ -+ -+/* Legal values for sh_type field of Elf32_Shdr. */ -+ -+#define SHT_MIPS_LIBLIST 0x70000000 /* Shared objects used in link */ -+#define SHT_MIPS_MSYM 0x70000001 -+#define SHT_MIPS_CONFLICT 0x70000002 /* Conflicting symbols */ -+#define SHT_MIPS_GPTAB 0x70000003 /* Global data area sizes */ -+#define SHT_MIPS_UCODE 0x70000004 /* Reserved for SGI/MIPS compilers */ -+#define SHT_MIPS_DEBUG 0x70000005 /* MIPS ECOFF debugging information*/ -+#define SHT_MIPS_REGINFO 0x70000006 /* Register usage information */ -+#define SHT_MIPS_PACKAGE 0x70000007 -+#define SHT_MIPS_PACKSYM 0x70000008 -+#define SHT_MIPS_RELD 0x70000009 -+#define SHT_MIPS_IFACE 0x7000000b -+#define SHT_MIPS_CONTENT 0x7000000c -+#define SHT_MIPS_OPTIONS 0x7000000d /* Miscellaneous options. */ -+#define SHT_MIPS_SHDR 0x70000010 -+#define SHT_MIPS_FDESC 0x70000011 -+#define SHT_MIPS_EXTSYM 0x70000012 -+#define SHT_MIPS_DENSE 0x70000013 -+#define SHT_MIPS_PDESC 0x70000014 -+#define SHT_MIPS_LOCSYM 0x70000015 -+#define SHT_MIPS_AUXSYM 0x70000016 -+#define SHT_MIPS_OPTSYM 0x70000017 -+#define SHT_MIPS_LOCSTR 0x70000018 -+#define SHT_MIPS_LINE 0x70000019 -+#define SHT_MIPS_RFDESC 0x7000001a -+#define SHT_MIPS_DELTASYM 0x7000001b -+#define SHT_MIPS_DELTAINST 0x7000001c -+#define SHT_MIPS_DELTACLASS 0x7000001d -+#define SHT_MIPS_DWARF 0x7000001e /* DWARF debugging information. */ -+#define SHT_MIPS_DELTADECL 0x7000001f -+#define SHT_MIPS_SYMBOL_LIB 0x70000020 -+#define SHT_MIPS_EVENTS 0x70000021 /* Event section. */ -+#define SHT_MIPS_TRANSLATE 0x70000022 -+#define SHT_MIPS_PIXIE 0x70000023 -+#define SHT_MIPS_XLATE 0x70000024 -+#define SHT_MIPS_XLATE_DEBUG 0x70000025 -+#define SHT_MIPS_WHIRL 0x70000026 -+#define SHT_MIPS_EH_REGION 0x70000027 -+#define SHT_MIPS_XLATE_OLD 0x70000028 -+#define SHT_MIPS_PDR_EXCEPTION 0x70000029 -+ -+/* Legal values for sh_flags field of Elf32_Shdr. */ -+ -+#define SHF_MIPS_GPREL 0x10000000 /* Must be part of global data area */ -+#define SHF_MIPS_MERGE 0x20000000 -+#define SHF_MIPS_ADDR 0x40000000 -+#define SHF_MIPS_STRINGS 0x80000000 -+#define SHF_MIPS_NOSTRIP 0x08000000 -+#define SHF_MIPS_LOCAL 0x04000000 -+#define SHF_MIPS_NAMES 0x02000000 -+#define SHF_MIPS_NODUPE 0x01000000 -+ -+ -+/* Symbol tables. */ -+ -+/* MIPS specific values for `st_other'. */ -+#define STO_MIPS_DEFAULT 0x0 -+#define STO_MIPS_INTERNAL 0x1 -+#define STO_MIPS_HIDDEN 0x2 -+#define STO_MIPS_PROTECTED 0x3 -+#define STO_MIPS_PLT 0x8 -+#define STO_MIPS_SC_ALIGN_UNUSED 0xff -+ -+/* MIPS specific values for `st_info'. */ -+#define STB_MIPS_SPLIT_COMMON 13 -+ -+/* Entries found in sections of type SHT_MIPS_GPTAB. 
*/ -+ -+typedef union -+{ -+ struct -+ { -+ Elf32_Word gt_current_g_value; /* -G value used for compilation */ -+ Elf32_Word gt_unused; /* Not used */ -+ } gt_header; /* First entry in section */ -+ struct -+ { -+ Elf32_Word gt_g_value; /* If this value were used for -G */ -+ Elf32_Word gt_bytes; /* This many bytes would be used */ -+ } gt_entry; /* Subsequent entries in section */ -+} Elf32_gptab; -+ -+/* Entry found in sections of type SHT_MIPS_REGINFO. */ -+ -+typedef struct -+{ -+ Elf32_Word ri_gprmask; /* General registers used */ -+ Elf32_Word ri_cprmask[4]; /* Coprocessor registers used */ -+ Elf32_Sword ri_gp_value; /* $gp register value */ -+} Elf32_RegInfo; -+ -+/* Entries found in sections of type SHT_MIPS_OPTIONS. */ -+ -+typedef struct -+{ -+ unsigned char kind; /* Determines interpretation of the -+ variable part of descriptor. */ -+ unsigned char size; /* Size of descriptor, including header. */ -+ Elf32_Section section; /* Section header index of section affected, -+ 0 for global options. */ -+ Elf32_Word info; /* Kind-specific information. */ -+} Elf_Options; -+ -+/* Values for `kind' field in Elf_Options. */ -+ -+#define ODK_NULL 0 /* Undefined. */ -+#define ODK_REGINFO 1 /* Register usage information. */ -+#define ODK_EXCEPTIONS 2 /* Exception processing options. */ -+#define ODK_PAD 3 /* Section padding options. */ -+#define ODK_HWPATCH 4 /* Hardware workarounds performed */ -+#define ODK_FILL 5 /* record the fill value used by the linker. */ -+#define ODK_TAGS 6 /* reserve space for desktop tools to write. */ -+#define ODK_HWAND 7 /* HW workarounds. 'AND' bits when merging. */ -+#define ODK_HWOR 8 /* HW workarounds. 'OR' bits when merging. */ -+ -+/* Values for `info' in Elf_Options for ODK_EXCEPTIONS entries. */ -+ -+#define OEX_FPU_MIN 0x1f /* FPE's which MUST be enabled. */ -+#define OEX_FPU_MAX 0x1f00 /* FPE's which MAY be enabled. */ -+#define OEX_PAGE0 0x10000 /* page zero must be mapped. */ -+#define OEX_SMM 0x20000 /* Force sequential memory mode? */ -+#define OEX_FPDBUG 0x40000 /* Force floating point debug mode? */ -+#define OEX_PRECISEFP OEX_FPDBUG -+#define OEX_DISMISS 0x80000 /* Dismiss invalid address faults? */ -+ -+#define OEX_FPU_INVAL 0x10 -+#define OEX_FPU_DIV0 0x08 -+#define OEX_FPU_OFLO 0x04 -+#define OEX_FPU_UFLO 0x02 -+#define OEX_FPU_INEX 0x01 -+ -+/* Masks for `info' in Elf_Options for an ODK_HWPATCH entry. */ -+ -+#define OHW_R4KEOP 0x1 /* R4000 end-of-page patch. */ -+#define OHW_R8KPFETCH 0x2 /* may need R8000 prefetch patch. */ -+#define OHW_R5KEOP 0x4 /* R5000 end-of-page patch. */ -+#define OHW_R5KCVTL 0x8 /* R5000 cvt.[ds].l bug. clean=1. */ -+ -+#define OPAD_PREFIX 0x1 -+#define OPAD_POSTFIX 0x2 -+#define OPAD_SYMBOL 0x4 -+ -+/* Entry found in `.options' section. */ -+ -+typedef struct -+{ -+ Elf32_Word hwp_flags1; /* Extra flags. */ -+ Elf32_Word hwp_flags2; /* Extra flags. */ -+} Elf_Options_Hw; -+ -+/* Masks for `info' in ElfOptions for ODK_HWAND and ODK_HWOR entries. */ -+ -+#define OHWA0_R4KEOP_CHECKED 0x00000001 -+#define OHWA1_R4KEOP_CLEAN 0x00000002 -+ -+/* MIPS relocs. 
*/ -+ -+#define R_MIPS_NONE 0 /* No reloc */ -+#define R_MIPS_16 1 /* Direct 16 bit */ -+#define R_MIPS_32 2 /* Direct 32 bit */ -+#define R_MIPS_REL32 3 /* PC relative 32 bit */ -+#define R_MIPS_26 4 /* Direct 26 bit shifted */ -+#define R_MIPS_HI16 5 /* High 16 bit */ -+#define R_MIPS_LO16 6 /* Low 16 bit */ -+#define R_MIPS_GPREL16 7 /* GP relative 16 bit */ -+#define R_MIPS_LITERAL 8 /* 16 bit literal entry */ -+#define R_MIPS_GOT16 9 /* 16 bit GOT entry */ -+#define R_MIPS_PC16 10 /* PC relative 16 bit */ -+#define R_MIPS_CALL16 11 /* 16 bit GOT entry for function */ -+#define R_MIPS_GPREL32 12 /* GP relative 32 bit */ -+ -+#define R_MIPS_SHIFT5 16 -+#define R_MIPS_SHIFT6 17 -+#define R_MIPS_64 18 -+#define R_MIPS_GOT_DISP 19 -+#define R_MIPS_GOT_PAGE 20 -+#define R_MIPS_GOT_OFST 21 -+#define R_MIPS_GOT_HI16 22 -+#define R_MIPS_GOT_LO16 23 -+#define R_MIPS_SUB 24 -+#define R_MIPS_INSERT_A 25 -+#define R_MIPS_INSERT_B 26 -+#define R_MIPS_DELETE 27 -+#define R_MIPS_HIGHER 28 -+#define R_MIPS_HIGHEST 29 -+#define R_MIPS_CALL_HI16 30 -+#define R_MIPS_CALL_LO16 31 -+#define R_MIPS_SCN_DISP 32 -+#define R_MIPS_REL16 33 -+#define R_MIPS_ADD_IMMEDIATE 34 -+#define R_MIPS_PJUMP 35 -+#define R_MIPS_RELGOT 36 -+#define R_MIPS_JALR 37 -+#define R_MIPS_TLS_DTPMOD32 38 /* Module number 32 bit */ -+#define R_MIPS_TLS_DTPREL32 39 /* Module-relative offset 32 bit */ -+#define R_MIPS_TLS_DTPMOD64 40 /* Module number 64 bit */ -+#define R_MIPS_TLS_DTPREL64 41 /* Module-relative offset 64 bit */ -+#define R_MIPS_TLS_GD 42 /* 16 bit GOT offset for GD */ -+#define R_MIPS_TLS_LDM 43 /* 16 bit GOT offset for LDM */ -+#define R_MIPS_TLS_DTPREL_HI16 44 /* Module-relative offset, high 16 bits */ -+#define R_MIPS_TLS_DTPREL_LO16 45 /* Module-relative offset, low 16 bits */ -+#define R_MIPS_TLS_GOTTPREL 46 /* 16 bit GOT offset for IE */ -+#define R_MIPS_TLS_TPREL32 47 /* TP-relative offset, 32 bit */ -+#define R_MIPS_TLS_TPREL64 48 /* TP-relative offset, 64 bit */ -+#define R_MIPS_TLS_TPREL_HI16 49 /* TP-relative offset, high 16 bits */ -+#define R_MIPS_TLS_TPREL_LO16 50 /* TP-relative offset, low 16 bits */ -+#define R_MIPS_GLOB_DAT 51 -+#define R_MIPS_COPY 126 -+#define R_MIPS_JUMP_SLOT 127 -+/* Keep this the last entry. */ -+#define R_MIPS_NUM 128 -+ -+/* Legal values for p_type field of Elf32_Phdr. */ -+ -+#define PT_MIPS_REGINFO 0x70000000 /* Register usage information */ -+#define PT_MIPS_RTPROC 0x70000001 /* Runtime procedure table. */ -+#define PT_MIPS_OPTIONS 0x70000002 -+ -+/* Special program header types. */ -+ -+#define PF_MIPS_LOCAL 0x10000000 -+ -+/* Legal values for d_tag field of Elf32_Dyn. 
*/ -+ -+#define DT_MIPS_RLD_VERSION 0x70000001 /* Runtime linker interface version */ -+#define DT_MIPS_TIME_STAMP 0x70000002 /* Timestamp */ -+#define DT_MIPS_ICHECKSUM 0x70000003 /* Checksum */ -+#define DT_MIPS_IVERSION 0x70000004 /* Version string (string tbl index) */ -+#define DT_MIPS_FLAGS 0x70000005 /* Flags */ -+#define DT_MIPS_BASE_ADDRESS 0x70000006 /* Base address */ -+#define DT_MIPS_MSYM 0x70000007 -+#define DT_MIPS_CONFLICT 0x70000008 /* Address of CONFLICT section */ -+#define DT_MIPS_LIBLIST 0x70000009 /* Address of LIBLIST section */ -+#define DT_MIPS_LOCAL_GOTNO 0x7000000a /* Number of local GOT entries */ -+#define DT_MIPS_CONFLICTNO 0x7000000b /* Number of CONFLICT entries */ -+#define DT_MIPS_LIBLISTNO 0x70000010 /* Number of LIBLIST entries */ -+#define DT_MIPS_SYMTABNO 0x70000011 /* Number of DYNSYM entries */ -+#define DT_MIPS_UNREFEXTNO 0x70000012 /* First external DYNSYM */ -+#define DT_MIPS_GOTSYM 0x70000013 /* First GOT entry in DYNSYM */ -+#define DT_MIPS_HIPAGENO 0x70000014 /* Number of GOT page table entries */ -+#define DT_MIPS_RLD_MAP 0x70000016 /* Address of run time loader map. */ -+#define DT_MIPS_DELTA_CLASS 0x70000017 /* Delta C++ class definition. */ -+#define DT_MIPS_DELTA_CLASS_NO 0x70000018 /* Number of entries in -+ DT_MIPS_DELTA_CLASS. */ -+#define DT_MIPS_DELTA_INSTANCE 0x70000019 /* Delta C++ class instances. */ -+#define DT_MIPS_DELTA_INSTANCE_NO 0x7000001a /* Number of entries in -+ DT_MIPS_DELTA_INSTANCE. */ -+#define DT_MIPS_DELTA_RELOC 0x7000001b /* Delta relocations. */ -+#define DT_MIPS_DELTA_RELOC_NO 0x7000001c /* Number of entries in -+ DT_MIPS_DELTA_RELOC. */ -+#define DT_MIPS_DELTA_SYM 0x7000001d /* Delta symbols that Delta -+ relocations refer to. */ -+#define DT_MIPS_DELTA_SYM_NO 0x7000001e /* Number of entries in -+ DT_MIPS_DELTA_SYM. */ -+#define DT_MIPS_DELTA_CLASSSYM 0x70000020 /* Delta symbols that hold the -+ class declaration. */ -+#define DT_MIPS_DELTA_CLASSSYM_NO 0x70000021 /* Number of entries in -+ DT_MIPS_DELTA_CLASSSYM. */ -+#define DT_MIPS_CXX_FLAGS 0x70000022 /* Flags indicating for C++ flavor. */ -+#define DT_MIPS_PIXIE_INIT 0x70000023 -+#define DT_MIPS_SYMBOL_LIB 0x70000024 -+#define DT_MIPS_LOCALPAGE_GOTIDX 0x70000025 -+#define DT_MIPS_LOCAL_GOTIDX 0x70000026 -+#define DT_MIPS_HIDDEN_GOTIDX 0x70000027 -+#define DT_MIPS_PROTECTED_GOTIDX 0x70000028 -+#define DT_MIPS_OPTIONS 0x70000029 /* Address of .options. */ -+#define DT_MIPS_INTERFACE 0x7000002a /* Address of .interface. */ -+#define DT_MIPS_DYNSTR_ALIGN 0x7000002b -+#define DT_MIPS_INTERFACE_SIZE 0x7000002c /* Size of the .interface section. */ -+#define DT_MIPS_RLD_TEXT_RESOLVE_ADDR 0x7000002d /* Address of rld_text_rsolve -+ function stored in GOT. */ -+#define DT_MIPS_PERF_SUFFIX 0x7000002e /* Default suffix of dso to be added -+ by rld on dlopen() calls. */ -+#define DT_MIPS_COMPACT_SIZE 0x7000002f /* (O32)Size of compact rel section. */ -+#define DT_MIPS_GP_VALUE 0x70000030 /* GP value for aux GOTs. */ -+#define DT_MIPS_AUX_DYNAMIC 0x70000031 /* Address of aux .dynamic. */ -+/* The address of .got.plt in an executable using the new non-PIC ABI. */ -+#define DT_MIPS_PLTGOT 0x70000032 -+/* The base of the PLT in an executable using the new non-PIC ABI if that -+ PLT is writable. For a non-writable PLT, this is omitted or has a zero -+ value. */ -+#define DT_MIPS_RWPLT 0x70000034 -+#define DT_MIPS_NUM 0x35 -+ -+/* Legal values for DT_MIPS_FLAGS Elf32_Dyn entry. 
*/ -+ -+#define RHF_NONE 0 /* No flags */ -+#define RHF_QUICKSTART (1 << 0) /* Use quickstart */ -+#define RHF_NOTPOT (1 << 1) /* Hash size not power of 2 */ -+#define RHF_NO_LIBRARY_REPLACEMENT (1 << 2) /* Ignore LD_LIBRARY_PATH */ -+#define RHF_NO_MOVE (1 << 3) -+#define RHF_SGI_ONLY (1 << 4) -+#define RHF_GUARANTEE_INIT (1 << 5) -+#define RHF_DELTA_C_PLUS_PLUS (1 << 6) -+#define RHF_GUARANTEE_START_INIT (1 << 7) -+#define RHF_PIXIE (1 << 8) -+#define RHF_DEFAULT_DELAY_LOAD (1 << 9) -+#define RHF_REQUICKSTART (1 << 10) -+#define RHF_REQUICKSTARTED (1 << 11) -+#define RHF_CORD (1 << 12) -+#define RHF_NO_UNRES_UNDEF (1 << 13) -+#define RHF_RLD_ORDER_SAFE (1 << 14) -+ -+/* Entries found in sections of type SHT_MIPS_LIBLIST. */ -+ -+typedef struct -+{ -+ Elf32_Word l_name; /* Name (string table index) */ -+ Elf32_Word l_time_stamp; /* Timestamp */ -+ Elf32_Word l_checksum; /* Checksum */ -+ Elf32_Word l_version; /* Interface version */ -+ Elf32_Word l_flags; /* Flags */ -+} Elf32_Lib; -+ -+typedef struct -+{ -+ Elf64_Word l_name; /* Name (string table index) */ -+ Elf64_Word l_time_stamp; /* Timestamp */ -+ Elf64_Word l_checksum; /* Checksum */ -+ Elf64_Word l_version; /* Interface version */ -+ Elf64_Word l_flags; /* Flags */ -+} Elf64_Lib; -+ -+ -+/* Legal values for l_flags. */ -+ -+#define LL_NONE 0 -+#define LL_EXACT_MATCH (1 << 0) /* Require exact match */ -+#define LL_IGNORE_INT_VER (1 << 1) /* Ignore interface version */ -+#define LL_REQUIRE_MINOR (1 << 2) -+#define LL_EXPORTS (1 << 3) -+#define LL_DELAY_LOAD (1 << 4) -+#define LL_DELTA (1 << 5) -+ -+/* Entries found in sections of type SHT_MIPS_CONFLICT. */ -+ -+typedef Elf32_Addr Elf32_Conflict; -+ -+ -+/* HPPA specific definitions. */ -+ -+/* Legal values for e_flags field of Elf32_Ehdr. */ -+ -+#define EF_PARISC_TRAPNIL 0x00010000 /* Trap nil pointer dereference. */ -+#define EF_PARISC_EXT 0x00020000 /* Program uses arch. extensions. */ -+#define EF_PARISC_LSB 0x00040000 /* Program expects little endian. */ -+#define EF_PARISC_WIDE 0x00080000 /* Program expects wide mode. */ -+#define EF_PARISC_NO_KABP 0x00100000 /* No kernel assisted branch -+ prediction. */ -+#define EF_PARISC_LAZYSWAP 0x00400000 /* Allow lazy swapping. */ -+#define EF_PARISC_ARCH 0x0000ffff /* Architecture version. */ -+ -+/* Defined values for `e_flags & EF_PARISC_ARCH' are: */ -+ -+#define EFA_PARISC_1_0 0x020b /* PA-RISC 1.0 big-endian. */ -+#define EFA_PARISC_1_1 0x0210 /* PA-RISC 1.1 big-endian. */ -+#define EFA_PARISC_2_0 0x0214 /* PA-RISC 2.0 big-endian. */ -+ -+/* Additional section indeces. */ -+ -+#define SHN_PARISC_ANSI_COMMON 0xff00 /* Section for tenatively declared -+ symbols in ANSI C. */ -+#define SHN_PARISC_HUGE_COMMON 0xff01 /* Common blocks in huge model. */ -+ -+/* Legal values for sh_type field of Elf32_Shdr. */ -+ -+#define SHT_PARISC_EXT 0x70000000 /* Contains product specific ext. */ -+#define SHT_PARISC_UNWIND 0x70000001 /* Unwind information. */ -+#define SHT_PARISC_DOC 0x70000002 /* Debug info for optimized code. */ -+ -+/* Legal values for sh_flags field of Elf32_Shdr. */ -+ -+#define SHF_PARISC_SHORT 0x20000000 /* Section with short addressing. */ -+#define SHF_PARISC_HUGE 0x40000000 /* Section far from gp. */ -+#define SHF_PARISC_SBP 0x80000000 /* Static branch prediction code. */ -+ -+/* Legal values for ST_TYPE subfield of st_info (symbol type). */ -+ -+#define STT_PARISC_MILLICODE 13 /* Millicode function entry point. */ -+ -+#define STT_HP_OPAQUE (STT_LOOS + 0x1) -+#define STT_HP_STUB (STT_LOOS + 0x2) -+ -+/* HPPA relocs. 
*/ -+ -+#define R_PARISC_NONE 0 /* No reloc. */ -+#define R_PARISC_DIR32 1 /* Direct 32-bit reference. */ -+#define R_PARISC_DIR21L 2 /* Left 21 bits of eff. address. */ -+#define R_PARISC_DIR17R 3 /* Right 17 bits of eff. address. */ -+#define R_PARISC_DIR17F 4 /* 17 bits of eff. address. */ -+#define R_PARISC_DIR14R 6 /* Right 14 bits of eff. address. */ -+#define R_PARISC_PCREL32 9 /* 32-bit rel. address. */ -+#define R_PARISC_PCREL21L 10 /* Left 21 bits of rel. address. */ -+#define R_PARISC_PCREL17R 11 /* Right 17 bits of rel. address. */ -+#define R_PARISC_PCREL17F 12 /* 17 bits of rel. address. */ -+#define R_PARISC_PCREL14R 14 /* Right 14 bits of rel. address. */ -+#define R_PARISC_DPREL21L 18 /* Left 21 bits of rel. address. */ -+#define R_PARISC_DPREL14R 22 /* Right 14 bits of rel. address. */ -+#define R_PARISC_GPREL21L 26 /* GP-relative, left 21 bits. */ -+#define R_PARISC_GPREL14R 30 /* GP-relative, right 14 bits. */ -+#define R_PARISC_LTOFF21L 34 /* LT-relative, left 21 bits. */ -+#define R_PARISC_LTOFF14R 38 /* LT-relative, right 14 bits. */ -+#define R_PARISC_SECREL32 41 /* 32 bits section rel. address. */ -+#define R_PARISC_SEGBASE 48 /* No relocation, set segment base. */ -+#define R_PARISC_SEGREL32 49 /* 32 bits segment rel. address. */ -+#define R_PARISC_PLTOFF21L 50 /* PLT rel. address, left 21 bits. */ -+#define R_PARISC_PLTOFF14R 54 /* PLT rel. address, right 14 bits. */ -+#define R_PARISC_LTOFF_FPTR32 57 /* 32 bits LT-rel. function pointer. */ -+#define R_PARISC_LTOFF_FPTR21L 58 /* LT-rel. fct ptr, left 21 bits. */ -+#define R_PARISC_LTOFF_FPTR14R 62 /* LT-rel. fct ptr, right 14 bits. */ -+#define R_PARISC_FPTR64 64 /* 64 bits function address. */ -+#define R_PARISC_PLABEL32 65 /* 32 bits function address. */ -+#define R_PARISC_PLABEL21L 66 /* Left 21 bits of fdesc address. */ -+#define R_PARISC_PLABEL14R 70 /* Right 14 bits of fdesc address. */ -+#define R_PARISC_PCREL64 72 /* 64 bits PC-rel. address. */ -+#define R_PARISC_PCREL22F 74 /* 22 bits PC-rel. address. */ -+#define R_PARISC_PCREL14WR 75 /* PC-rel. address, right 14 bits. */ -+#define R_PARISC_PCREL14DR 76 /* PC rel. address, right 14 bits. */ -+#define R_PARISC_PCREL16F 77 /* 16 bits PC-rel. address. */ -+#define R_PARISC_PCREL16WF 78 /* 16 bits PC-rel. address. */ -+#define R_PARISC_PCREL16DF 79 /* 16 bits PC-rel. address. */ -+#define R_PARISC_DIR64 80 /* 64 bits of eff. address. */ -+#define R_PARISC_DIR14WR 83 /* 14 bits of eff. address. */ -+#define R_PARISC_DIR14DR 84 /* 14 bits of eff. address. */ -+#define R_PARISC_DIR16F 85 /* 16 bits of eff. address. */ -+#define R_PARISC_DIR16WF 86 /* 16 bits of eff. address. */ -+#define R_PARISC_DIR16DF 87 /* 16 bits of eff. address. */ -+#define R_PARISC_GPREL64 88 /* 64 bits of GP-rel. address. */ -+#define R_PARISC_GPREL14WR 91 /* GP-rel. address, right 14 bits. */ -+#define R_PARISC_GPREL14DR 92 /* GP-rel. address, right 14 bits. */ -+#define R_PARISC_GPREL16F 93 /* 16 bits GP-rel. address. */ -+#define R_PARISC_GPREL16WF 94 /* 16 bits GP-rel. address. */ -+#define R_PARISC_GPREL16DF 95 /* 16 bits GP-rel. address. */ -+#define R_PARISC_LTOFF64 96 /* 64 bits LT-rel. address. */ -+#define R_PARISC_LTOFF14WR 99 /* LT-rel. address, right 14 bits. */ -+#define R_PARISC_LTOFF14DR 100 /* LT-rel. address, right 14 bits. */ -+#define R_PARISC_LTOFF16F 101 /* 16 bits LT-rel. address. */ -+#define R_PARISC_LTOFF16WF 102 /* 16 bits LT-rel. address. */ -+#define R_PARISC_LTOFF16DF 103 /* 16 bits LT-rel. address. 
*/ -+#define R_PARISC_SECREL64 104 /* 64 bits section rel. address. */ -+#define R_PARISC_SEGREL64 112 /* 64 bits segment rel. address. */ -+#define R_PARISC_PLTOFF14WR 115 /* PLT-rel. address, right 14 bits. */ -+#define R_PARISC_PLTOFF14DR 116 /* PLT-rel. address, right 14 bits. */ -+#define R_PARISC_PLTOFF16F 117 /* 16 bits LT-rel. address. */ -+#define R_PARISC_PLTOFF16WF 118 /* 16 bits PLT-rel. address. */ -+#define R_PARISC_PLTOFF16DF 119 /* 16 bits PLT-rel. address. */ -+#define R_PARISC_LTOFF_FPTR64 120 /* 64 bits LT-rel. function ptr. */ -+#define R_PARISC_LTOFF_FPTR14WR 123 /* LT-rel. fct. ptr., right 14 bits. */ -+#define R_PARISC_LTOFF_FPTR14DR 124 /* LT-rel. fct. ptr., right 14 bits. */ -+#define R_PARISC_LTOFF_FPTR16F 125 /* 16 bits LT-rel. function ptr. */ -+#define R_PARISC_LTOFF_FPTR16WF 126 /* 16 bits LT-rel. function ptr. */ -+#define R_PARISC_LTOFF_FPTR16DF 127 /* 16 bits LT-rel. function ptr. */ -+#define R_PARISC_LORESERVE 128 -+#define R_PARISC_COPY 128 /* Copy relocation. */ -+#define R_PARISC_IPLT 129 /* Dynamic reloc, imported PLT */ -+#define R_PARISC_EPLT 130 /* Dynamic reloc, exported PLT */ -+#define R_PARISC_TPREL32 153 /* 32 bits TP-rel. address. */ -+#define R_PARISC_TPREL21L 154 /* TP-rel. address, left 21 bits. */ -+#define R_PARISC_TPREL14R 158 /* TP-rel. address, right 14 bits. */ -+#define R_PARISC_LTOFF_TP21L 162 /* LT-TP-rel. address, left 21 bits. */ -+#define R_PARISC_LTOFF_TP14R 166 /* LT-TP-rel. address, right 14 bits.*/ -+#define R_PARISC_LTOFF_TP14F 167 /* 14 bits LT-TP-rel. address. */ -+#define R_PARISC_TPREL64 216 /* 64 bits TP-rel. address. */ -+#define R_PARISC_TPREL14WR 219 /* TP-rel. address, right 14 bits. */ -+#define R_PARISC_TPREL14DR 220 /* TP-rel. address, right 14 bits. */ -+#define R_PARISC_TPREL16F 221 /* 16 bits TP-rel. address. */ -+#define R_PARISC_TPREL16WF 222 /* 16 bits TP-rel. address. */ -+#define R_PARISC_TPREL16DF 223 /* 16 bits TP-rel. address. */ -+#define R_PARISC_LTOFF_TP64 224 /* 64 bits LT-TP-rel. address. */ -+#define R_PARISC_LTOFF_TP14WR 227 /* LT-TP-rel. address, right 14 bits.*/ -+#define R_PARISC_LTOFF_TP14DR 228 /* LT-TP-rel. address, right 14 bits.*/ -+#define R_PARISC_LTOFF_TP16F 229 /* 16 bits LT-TP-rel. address. */ -+#define R_PARISC_LTOFF_TP16WF 230 /* 16 bits LT-TP-rel. address. */ -+#define R_PARISC_LTOFF_TP16DF 231 /* 16 bits LT-TP-rel. address. */ -+#define R_PARISC_GNU_VTENTRY 232 -+#define R_PARISC_GNU_VTINHERIT 233 -+#define R_PARISC_TLS_GD21L 234 /* GD 21-bit left. */ -+#define R_PARISC_TLS_GD14R 235 /* GD 14-bit right. */ -+#define R_PARISC_TLS_GDCALL 236 /* GD call to __t_g_a. */ -+#define R_PARISC_TLS_LDM21L 237 /* LD module 21-bit left. */ -+#define R_PARISC_TLS_LDM14R 238 /* LD module 14-bit right. */ -+#define R_PARISC_TLS_LDMCALL 239 /* LD module call to __t_g_a. */ -+#define R_PARISC_TLS_LDO21L 240 /* LD offset 21-bit left. */ -+#define R_PARISC_TLS_LDO14R 241 /* LD offset 14-bit right. */ -+#define R_PARISC_TLS_DTPMOD32 242 /* DTP module 32-bit. */ -+#define R_PARISC_TLS_DTPMOD64 243 /* DTP module 64-bit. */ -+#define R_PARISC_TLS_DTPOFF32 244 /* DTP offset 32-bit. */ -+#define R_PARISC_TLS_DTPOFF64 245 /* DTP offset 32-bit. 
*/ -+#define R_PARISC_TLS_LE21L R_PARISC_TPREL21L -+#define R_PARISC_TLS_LE14R R_PARISC_TPREL14R -+#define R_PARISC_TLS_IE21L R_PARISC_LTOFF_TP21L -+#define R_PARISC_TLS_IE14R R_PARISC_LTOFF_TP14R -+#define R_PARISC_TLS_TPREL32 R_PARISC_TPREL32 -+#define R_PARISC_TLS_TPREL64 R_PARISC_TPREL64 -+#define R_PARISC_HIRESERVE 255 -+ -+/* Legal values for p_type field of Elf32_Phdr/Elf64_Phdr. */ -+ -+#define PT_HP_TLS (PT_LOOS + 0x0) -+#define PT_HP_CORE_NONE (PT_LOOS + 0x1) -+#define PT_HP_CORE_VERSION (PT_LOOS + 0x2) -+#define PT_HP_CORE_KERNEL (PT_LOOS + 0x3) -+#define PT_HP_CORE_COMM (PT_LOOS + 0x4) -+#define PT_HP_CORE_PROC (PT_LOOS + 0x5) -+#define PT_HP_CORE_LOADABLE (PT_LOOS + 0x6) -+#define PT_HP_CORE_STACK (PT_LOOS + 0x7) -+#define PT_HP_CORE_SHM (PT_LOOS + 0x8) -+#define PT_HP_CORE_MMF (PT_LOOS + 0x9) -+#define PT_HP_PARALLEL (PT_LOOS + 0x10) -+#define PT_HP_FASTBIND (PT_LOOS + 0x11) -+#define PT_HP_OPT_ANNOT (PT_LOOS + 0x12) -+#define PT_HP_HSL_ANNOT (PT_LOOS + 0x13) -+#define PT_HP_STACK (PT_LOOS + 0x14) -+ -+#define PT_PARISC_ARCHEXT 0x70000000 -+#define PT_PARISC_UNWIND 0x70000001 -+ -+/* Legal values for p_flags field of Elf32_Phdr/Elf64_Phdr. */ -+ -+#define PF_PARISC_SBP 0x08000000 -+ -+#define PF_HP_PAGE_SIZE 0x00100000 -+#define PF_HP_FAR_SHARED 0x00200000 -+#define PF_HP_NEAR_SHARED 0x00400000 -+#define PF_HP_CODE 0x01000000 -+#define PF_HP_MODIFY 0x02000000 -+#define PF_HP_LAZYSWAP 0x04000000 -+#define PF_HP_SBP 0x08000000 -+ -+ -+/* Alpha specific definitions. */ -+ -+/* Legal values for e_flags field of Elf64_Ehdr. */ -+ -+#define EF_ALPHA_32BIT 1 /* All addresses must be < 2GB. */ -+#define EF_ALPHA_CANRELAX 2 /* Relocations for relaxing exist. */ -+ -+/* Legal values for sh_type field of Elf64_Shdr. */ -+ -+/* These two are primerily concerned with ECOFF debugging info. */ -+#define SHT_ALPHA_DEBUG 0x70000001 -+#define SHT_ALPHA_REGINFO 0x70000002 -+ -+/* Legal values for sh_flags field of Elf64_Shdr. */ -+ -+#define SHF_ALPHA_GPREL 0x10000000 -+ -+/* Legal values for st_other field of Elf64_Sym. */ -+#define STO_ALPHA_NOPV 0x80 /* No PV required. */ -+#define STO_ALPHA_STD_GPLOAD 0x88 /* PV only used for initial ldgp. */ -+ -+/* Alpha relocs. 
*/ -+ -+#define R_ALPHA_NONE 0 /* No reloc */ -+#define R_ALPHA_REFLONG 1 /* Direct 32 bit */ -+#define R_ALPHA_REFQUAD 2 /* Direct 64 bit */ -+#define R_ALPHA_GPREL32 3 /* GP relative 32 bit */ -+#define R_ALPHA_LITERAL 4 /* GP relative 16 bit w/optimization */ -+#define R_ALPHA_LITUSE 5 /* Optimization hint for LITERAL */ -+#define R_ALPHA_GPDISP 6 /* Add displacement to GP */ -+#define R_ALPHA_BRADDR 7 /* PC+4 relative 23 bit shifted */ -+#define R_ALPHA_HINT 8 /* PC+4 relative 16 bit shifted */ -+#define R_ALPHA_SREL16 9 /* PC relative 16 bit */ -+#define R_ALPHA_SREL32 10 /* PC relative 32 bit */ -+#define R_ALPHA_SREL64 11 /* PC relative 64 bit */ -+#define R_ALPHA_GPRELHIGH 17 /* GP relative 32 bit, high 16 bits */ -+#define R_ALPHA_GPRELLOW 18 /* GP relative 32 bit, low 16 bits */ -+#define R_ALPHA_GPREL16 19 /* GP relative 16 bit */ -+#define R_ALPHA_COPY 24 /* Copy symbol at runtime */ -+#define R_ALPHA_GLOB_DAT 25 /* Create GOT entry */ -+#define R_ALPHA_JMP_SLOT 26 /* Create PLT entry */ -+#define R_ALPHA_RELATIVE 27 /* Adjust by program base */ -+#define R_ALPHA_TLS_GD_HI 28 -+#define R_ALPHA_TLSGD 29 -+#define R_ALPHA_TLS_LDM 30 -+#define R_ALPHA_DTPMOD64 31 -+#define R_ALPHA_GOTDTPREL 32 -+#define R_ALPHA_DTPREL64 33 -+#define R_ALPHA_DTPRELHI 34 -+#define R_ALPHA_DTPRELLO 35 -+#define R_ALPHA_DTPREL16 36 -+#define R_ALPHA_GOTTPREL 37 -+#define R_ALPHA_TPREL64 38 -+#define R_ALPHA_TPRELHI 39 -+#define R_ALPHA_TPRELLO 40 -+#define R_ALPHA_TPREL16 41 -+/* Keep this the last entry. */ -+#define R_ALPHA_NUM 46 -+ -+/* Magic values of the LITUSE relocation addend. */ -+#define LITUSE_ALPHA_ADDR 0 -+#define LITUSE_ALPHA_BASE 1 -+#define LITUSE_ALPHA_BYTOFF 2 -+#define LITUSE_ALPHA_JSR 3 -+#define LITUSE_ALPHA_TLS_GD 4 -+#define LITUSE_ALPHA_TLS_LDM 5 -+ -+/* Legal values for d_tag of Elf64_Dyn. */ -+#define DT_ALPHA_PLTRO (DT_LOPROC + 0) -+#define DT_ALPHA_NUM 1 -+ -+/* PowerPC specific declarations */ -+ -+/* Values for Elf32/64_Ehdr.e_flags. */ -+#define EF_PPC_EMB 0x80000000 /* PowerPC embedded flag */ -+ -+/* Cygnus local bits below */ -+#define EF_PPC_RELOCATABLE 0x00010000 /* PowerPC -mrelocatable flag*/ -+#define EF_PPC_RELOCATABLE_LIB 0x00008000 /* PowerPC -mrelocatable-lib -+ flag */ -+ -+/* PowerPC relocations defined by the ABIs */ -+#define R_PPC_NONE 0 -+#define R_PPC_ADDR32 1 /* 32bit absolute address */ -+#define R_PPC_ADDR24 2 /* 26bit address, 2 bits ignored. 
*/ -+#define R_PPC_ADDR16 3 /* 16bit absolute address */ -+#define R_PPC_ADDR16_LO 4 /* lower 16bit of absolute address */ -+#define R_PPC_ADDR16_HI 5 /* high 16bit of absolute address */ -+#define R_PPC_ADDR16_HA 6 /* adjusted high 16bit */ -+#define R_PPC_ADDR14 7 /* 16bit address, 2 bits ignored */ -+#define R_PPC_ADDR14_BRTAKEN 8 -+#define R_PPC_ADDR14_BRNTAKEN 9 -+#define R_PPC_REL24 10 /* PC relative 26 bit */ -+#define R_PPC_REL14 11 /* PC relative 16 bit */ -+#define R_PPC_REL14_BRTAKEN 12 -+#define R_PPC_REL14_BRNTAKEN 13 -+#define R_PPC_GOT16 14 -+#define R_PPC_GOT16_LO 15 -+#define R_PPC_GOT16_HI 16 -+#define R_PPC_GOT16_HA 17 -+#define R_PPC_PLTREL24 18 -+#define R_PPC_COPY 19 -+#define R_PPC_GLOB_DAT 20 -+#define R_PPC_JMP_SLOT 21 -+#define R_PPC_RELATIVE 22 -+#define R_PPC_LOCAL24PC 23 -+#define R_PPC_UADDR32 24 -+#define R_PPC_UADDR16 25 -+#define R_PPC_REL32 26 -+#define R_PPC_PLT32 27 -+#define R_PPC_PLTREL32 28 -+#define R_PPC_PLT16_LO 29 -+#define R_PPC_PLT16_HI 30 -+#define R_PPC_PLT16_HA 31 -+#define R_PPC_SDAREL16 32 -+#define R_PPC_SECTOFF 33 -+#define R_PPC_SECTOFF_LO 34 -+#define R_PPC_SECTOFF_HI 35 -+#define R_PPC_SECTOFF_HA 36 -+ -+/* PowerPC relocations defined for the TLS access ABI. */ -+#define R_PPC_TLS 67 /* none (sym+add)@tls */ -+#define R_PPC_DTPMOD32 68 /* word32 (sym+add)@dtpmod */ -+#define R_PPC_TPREL16 69 /* half16* (sym+add)@tprel */ -+#define R_PPC_TPREL16_LO 70 /* half16 (sym+add)@tprel@l */ -+#define R_PPC_TPREL16_HI 71 /* half16 (sym+add)@tprel@h */ -+#define R_PPC_TPREL16_HA 72 /* half16 (sym+add)@tprel@ha */ -+#define R_PPC_TPREL32 73 /* word32 (sym+add)@tprel */ -+#define R_PPC_DTPREL16 74 /* half16* (sym+add)@dtprel */ -+#define R_PPC_DTPREL16_LO 75 /* half16 (sym+add)@dtprel@l */ -+#define R_PPC_DTPREL16_HI 76 /* half16 (sym+add)@dtprel@h */ -+#define R_PPC_DTPREL16_HA 77 /* half16 (sym+add)@dtprel@ha */ -+#define R_PPC_DTPREL32 78 /* word32 (sym+add)@dtprel */ -+#define R_PPC_GOT_TLSGD16 79 /* half16* (sym+add)@got@tlsgd */ -+#define R_PPC_GOT_TLSGD16_LO 80 /* half16 (sym+add)@got@tlsgd@l */ -+#define R_PPC_GOT_TLSGD16_HI 81 /* half16 (sym+add)@got@tlsgd@h */ -+#define R_PPC_GOT_TLSGD16_HA 82 /* half16 (sym+add)@got@tlsgd@ha */ -+#define R_PPC_GOT_TLSLD16 83 /* half16* (sym+add)@got@tlsld */ -+#define R_PPC_GOT_TLSLD16_LO 84 /* half16 (sym+add)@got@tlsld@l */ -+#define R_PPC_GOT_TLSLD16_HI 85 /* half16 (sym+add)@got@tlsld@h */ -+#define R_PPC_GOT_TLSLD16_HA 86 /* half16 (sym+add)@got@tlsld@ha */ -+#define R_PPC_GOT_TPREL16 87 /* half16* (sym+add)@got@tprel */ -+#define R_PPC_GOT_TPREL16_LO 88 /* half16 (sym+add)@got@tprel@l */ -+#define R_PPC_GOT_TPREL16_HI 89 /* half16 (sym+add)@got@tprel@h */ -+#define R_PPC_GOT_TPREL16_HA 90 /* half16 (sym+add)@got@tprel@ha */ -+#define R_PPC_GOT_DTPREL16 91 /* half16* (sym+add)@got@dtprel */ -+#define R_PPC_GOT_DTPREL16_LO 92 /* half16* (sym+add)@got@dtprel@l */ -+#define R_PPC_GOT_DTPREL16_HI 93 /* half16* (sym+add)@got@dtprel@h */ -+#define R_PPC_GOT_DTPREL16_HA 94 /* half16* (sym+add)@got@dtprel@ha */ -+ -+/* The remaining relocs are from the Embedded ELF ABI, and are not -+ in the SVR4 ELF ABI. 
*/ -+#define R_PPC_EMB_NADDR32 101 -+#define R_PPC_EMB_NADDR16 102 -+#define R_PPC_EMB_NADDR16_LO 103 -+#define R_PPC_EMB_NADDR16_HI 104 -+#define R_PPC_EMB_NADDR16_HA 105 -+#define R_PPC_EMB_SDAI16 106 -+#define R_PPC_EMB_SDA2I16 107 -+#define R_PPC_EMB_SDA2REL 108 -+#define R_PPC_EMB_SDA21 109 /* 16 bit offset in SDA */ -+#define R_PPC_EMB_MRKREF 110 -+#define R_PPC_EMB_RELSEC16 111 -+#define R_PPC_EMB_RELST_LO 112 -+#define R_PPC_EMB_RELST_HI 113 -+#define R_PPC_EMB_RELST_HA 114 -+#define R_PPC_EMB_BIT_FLD 115 -+#define R_PPC_EMB_RELSDA 116 /* 16 bit relative offset in SDA */ -+ -+/* Diab tool relocations. */ -+#define R_PPC_DIAB_SDA21_LO 180 /* like EMB_SDA21, but lower 16 bit */ -+#define R_PPC_DIAB_SDA21_HI 181 /* like EMB_SDA21, but high 16 bit */ -+#define R_PPC_DIAB_SDA21_HA 182 /* like EMB_SDA21, adjusted high 16 */ -+#define R_PPC_DIAB_RELSDA_LO 183 /* like EMB_RELSDA, but lower 16 bit */ -+#define R_PPC_DIAB_RELSDA_HI 184 /* like EMB_RELSDA, but high 16 bit */ -+#define R_PPC_DIAB_RELSDA_HA 185 /* like EMB_RELSDA, adjusted high 16 */ -+ -+/* GNU extension to support local ifunc. */ -+#define R_PPC_IRELATIVE 248 -+ -+/* GNU relocs used in PIC code sequences. */ -+#define R_PPC_REL16 249 /* half16 (sym+add-.) */ -+#define R_PPC_REL16_LO 250 /* half16 (sym+add-.)@l */ -+#define R_PPC_REL16_HI 251 /* half16 (sym+add-.)@h */ -+#define R_PPC_REL16_HA 252 /* half16 (sym+add-.)@ha */ -+ -+/* This is a phony reloc to handle any old fashioned TOC16 references -+ that may still be in object files. */ -+#define R_PPC_TOC16 255 -+ -+/* PowerPC specific values for the Dyn d_tag field. */ -+#define DT_PPC_GOT (DT_LOPROC + 0) -+#define DT_PPC_NUM 1 -+ -+/* PowerPC64 relocations defined by the ABIs */ -+#define R_PPC64_NONE R_PPC_NONE -+#define R_PPC64_ADDR32 R_PPC_ADDR32 /* 32bit absolute address */ -+#define R_PPC64_ADDR24 R_PPC_ADDR24 /* 26bit address, word aligned */ -+#define R_PPC64_ADDR16 R_PPC_ADDR16 /* 16bit absolute address */ -+#define R_PPC64_ADDR16_LO R_PPC_ADDR16_LO /* lower 16bits of address */ -+#define R_PPC64_ADDR16_HI R_PPC_ADDR16_HI /* high 16bits of address. */ -+#define R_PPC64_ADDR16_HA R_PPC_ADDR16_HA /* adjusted high 16bits. */ -+#define R_PPC64_ADDR14 R_PPC_ADDR14 /* 16bit address, word aligned */ -+#define R_PPC64_ADDR14_BRTAKEN R_PPC_ADDR14_BRTAKEN -+#define R_PPC64_ADDR14_BRNTAKEN R_PPC_ADDR14_BRNTAKEN -+#define R_PPC64_REL24 R_PPC_REL24 /* PC-rel. 
26 bit, word aligned */ -+#define R_PPC64_REL14 R_PPC_REL14 /* PC relative 16 bit */ -+#define R_PPC64_REL14_BRTAKEN R_PPC_REL14_BRTAKEN -+#define R_PPC64_REL14_BRNTAKEN R_PPC_REL14_BRNTAKEN -+#define R_PPC64_GOT16 R_PPC_GOT16 -+#define R_PPC64_GOT16_LO R_PPC_GOT16_LO -+#define R_PPC64_GOT16_HI R_PPC_GOT16_HI -+#define R_PPC64_GOT16_HA R_PPC_GOT16_HA -+ -+#define R_PPC64_COPY R_PPC_COPY -+#define R_PPC64_GLOB_DAT R_PPC_GLOB_DAT -+#define R_PPC64_JMP_SLOT R_PPC_JMP_SLOT -+#define R_PPC64_RELATIVE R_PPC_RELATIVE -+ -+#define R_PPC64_UADDR32 R_PPC_UADDR32 -+#define R_PPC64_UADDR16 R_PPC_UADDR16 -+#define R_PPC64_REL32 R_PPC_REL32 -+#define R_PPC64_PLT32 R_PPC_PLT32 -+#define R_PPC64_PLTREL32 R_PPC_PLTREL32 -+#define R_PPC64_PLT16_LO R_PPC_PLT16_LO -+#define R_PPC64_PLT16_HI R_PPC_PLT16_HI -+#define R_PPC64_PLT16_HA R_PPC_PLT16_HA -+ -+#define R_PPC64_SECTOFF R_PPC_SECTOFF -+#define R_PPC64_SECTOFF_LO R_PPC_SECTOFF_LO -+#define R_PPC64_SECTOFF_HI R_PPC_SECTOFF_HI -+#define R_PPC64_SECTOFF_HA R_PPC_SECTOFF_HA -+#define R_PPC64_ADDR30 37 /* word30 (S + A - P) >> 2 */ -+#define R_PPC64_ADDR64 38 /* doubleword64 S + A */ -+#define R_PPC64_ADDR16_HIGHER 39 /* half16 #higher(S + A) */ -+#define R_PPC64_ADDR16_HIGHERA 40 /* half16 #highera(S + A) */ -+#define R_PPC64_ADDR16_HIGHEST 41 /* half16 #highest(S + A) */ -+#define R_PPC64_ADDR16_HIGHESTA 42 /* half16 #highesta(S + A) */ -+#define R_PPC64_UADDR64 43 /* doubleword64 S + A */ -+#define R_PPC64_REL64 44 /* doubleword64 S + A - P */ -+#define R_PPC64_PLT64 45 /* doubleword64 L + A */ -+#define R_PPC64_PLTREL64 46 /* doubleword64 L + A - P */ -+#define R_PPC64_TOC16 47 /* half16* S + A - .TOC */ -+#define R_PPC64_TOC16_LO 48 /* half16 #lo(S + A - .TOC.) */ -+#define R_PPC64_TOC16_HI 49 /* half16 #hi(S + A - .TOC.) */ -+#define R_PPC64_TOC16_HA 50 /* half16 #ha(S + A - .TOC.) */ -+#define R_PPC64_TOC 51 /* doubleword64 .TOC */ -+#define R_PPC64_PLTGOT16 52 /* half16* M + A */ -+#define R_PPC64_PLTGOT16_LO 53 /* half16 #lo(M + A) */ -+#define R_PPC64_PLTGOT16_HI 54 /* half16 #hi(M + A) */ -+#define R_PPC64_PLTGOT16_HA 55 /* half16 #ha(M + A) */ -+ -+#define R_PPC64_ADDR16_DS 56 /* half16ds* (S + A) >> 2 */ -+#define R_PPC64_ADDR16_LO_DS 57 /* half16ds #lo(S + A) >> 2 */ -+#define R_PPC64_GOT16_DS 58 /* half16ds* (G + A) >> 2 */ -+#define R_PPC64_GOT16_LO_DS 59 /* half16ds #lo(G + A) >> 2 */ -+#define R_PPC64_PLT16_LO_DS 60 /* half16ds #lo(L + A) >> 2 */ -+#define R_PPC64_SECTOFF_DS 61 /* half16ds* (R + A) >> 2 */ -+#define R_PPC64_SECTOFF_LO_DS 62 /* half16ds #lo(R + A) >> 2 */ -+#define R_PPC64_TOC16_DS 63 /* half16ds* (S + A - .TOC.) >> 2 */ -+#define R_PPC64_TOC16_LO_DS 64 /* half16ds #lo(S + A - .TOC.) >> 2 */ -+#define R_PPC64_PLTGOT16_DS 65 /* half16ds* (M + A) >> 2 */ -+#define R_PPC64_PLTGOT16_LO_DS 66 /* half16ds #lo(M + A) >> 2 */ -+ -+/* PowerPC64 relocations defined for the TLS access ABI. 
*/ -+#define R_PPC64_TLS 67 /* none (sym+add)@tls */ -+#define R_PPC64_DTPMOD64 68 /* doubleword64 (sym+add)@dtpmod */ -+#define R_PPC64_TPREL16 69 /* half16* (sym+add)@tprel */ -+#define R_PPC64_TPREL16_LO 70 /* half16 (sym+add)@tprel@l */ -+#define R_PPC64_TPREL16_HI 71 /* half16 (sym+add)@tprel@h */ -+#define R_PPC64_TPREL16_HA 72 /* half16 (sym+add)@tprel@ha */ -+#define R_PPC64_TPREL64 73 /* doubleword64 (sym+add)@tprel */ -+#define R_PPC64_DTPREL16 74 /* half16* (sym+add)@dtprel */ -+#define R_PPC64_DTPREL16_LO 75 /* half16 (sym+add)@dtprel@l */ -+#define R_PPC64_DTPREL16_HI 76 /* half16 (sym+add)@dtprel@h */ -+#define R_PPC64_DTPREL16_HA 77 /* half16 (sym+add)@dtprel@ha */ -+#define R_PPC64_DTPREL64 78 /* doubleword64 (sym+add)@dtprel */ -+#define R_PPC64_GOT_TLSGD16 79 /* half16* (sym+add)@got@tlsgd */ -+#define R_PPC64_GOT_TLSGD16_LO 80 /* half16 (sym+add)@got@tlsgd@l */ -+#define R_PPC64_GOT_TLSGD16_HI 81 /* half16 (sym+add)@got@tlsgd@h */ -+#define R_PPC64_GOT_TLSGD16_HA 82 /* half16 (sym+add)@got@tlsgd@ha */ -+#define R_PPC64_GOT_TLSLD16 83 /* half16* (sym+add)@got@tlsld */ -+#define R_PPC64_GOT_TLSLD16_LO 84 /* half16 (sym+add)@got@tlsld@l */ -+#define R_PPC64_GOT_TLSLD16_HI 85 /* half16 (sym+add)@got@tlsld@h */ -+#define R_PPC64_GOT_TLSLD16_HA 86 /* half16 (sym+add)@got@tlsld@ha */ -+#define R_PPC64_GOT_TPREL16_DS 87 /* half16ds* (sym+add)@got@tprel */ -+#define R_PPC64_GOT_TPREL16_LO_DS 88 /* half16ds (sym+add)@got@tprel@l */ -+#define R_PPC64_GOT_TPREL16_HI 89 /* half16 (sym+add)@got@tprel@h */ -+#define R_PPC64_GOT_TPREL16_HA 90 /* half16 (sym+add)@got@tprel@ha */ -+#define R_PPC64_GOT_DTPREL16_DS 91 /* half16ds* (sym+add)@got@dtprel */ -+#define R_PPC64_GOT_DTPREL16_LO_DS 92 /* half16ds (sym+add)@got@dtprel@l */ -+#define R_PPC64_GOT_DTPREL16_HI 93 /* half16 (sym+add)@got@dtprel@h */ -+#define R_PPC64_GOT_DTPREL16_HA 94 /* half16 (sym+add)@got@dtprel@ha */ -+#define R_PPC64_TPREL16_DS 95 /* half16ds* (sym+add)@tprel */ -+#define R_PPC64_TPREL16_LO_DS 96 /* half16ds (sym+add)@tprel@l */ -+#define R_PPC64_TPREL16_HIGHER 97 /* half16 (sym+add)@tprel@higher */ -+#define R_PPC64_TPREL16_HIGHERA 98 /* half16 (sym+add)@tprel@highera */ -+#define R_PPC64_TPREL16_HIGHEST 99 /* half16 (sym+add)@tprel@highest */ -+#define R_PPC64_TPREL16_HIGHESTA 100 /* half16 (sym+add)@tprel@highesta */ -+#define R_PPC64_DTPREL16_DS 101 /* half16ds* (sym+add)@dtprel */ -+#define R_PPC64_DTPREL16_LO_DS 102 /* half16ds (sym+add)@dtprel@l */ -+#define R_PPC64_DTPREL16_HIGHER 103 /* half16 (sym+add)@dtprel@higher */ -+#define R_PPC64_DTPREL16_HIGHERA 104 /* half16 (sym+add)@dtprel@highera */ -+#define R_PPC64_DTPREL16_HIGHEST 105 /* half16 (sym+add)@dtprel@highest */ -+#define R_PPC64_DTPREL16_HIGHESTA 106 /* half16 (sym+add)@dtprel@highesta */ -+ -+/* GNU extension to support local ifunc. */ -+#define R_PPC64_JMP_IREL 247 -+#define R_PPC64_IRELATIVE 248 -+#define R_PPC64_REL16 249 /* half16 (sym+add-.) */ -+#define R_PPC64_REL16_LO 250 /* half16 (sym+add-.)@l */ -+#define R_PPC64_REL16_HI 251 /* half16 (sym+add-.)@h */ -+#define R_PPC64_REL16_HA 252 /* half16 (sym+add-.)@ha */ -+ -+/* PowerPC64 specific values for the Dyn d_tag field. */ -+#define DT_PPC64_GLINK (DT_LOPROC + 0) -+#define DT_PPC64_OPD (DT_LOPROC + 1) -+#define DT_PPC64_OPDSZ (DT_LOPROC + 2) -+#define DT_PPC64_NUM 3 -+ -+ -+/* ARM specific declarations */ -+ -+/* Processor specific flags for the ELF header e_flags field. 
*/ -+#define EF_ARM_RELEXEC 0x01 -+#define EF_ARM_HASENTRY 0x02 -+#define EF_ARM_INTERWORK 0x04 -+#define EF_ARM_APCS_26 0x08 -+#define EF_ARM_APCS_FLOAT 0x10 -+#define EF_ARM_PIC 0x20 -+#define EF_ARM_ALIGN8 0x40 /* 8-bit structure alignment is in use */ -+#define EF_ARM_NEW_ABI 0x80 -+#define EF_ARM_OLD_ABI 0x100 -+#define EF_ARM_SOFT_FLOAT 0x200 -+#define EF_ARM_VFP_FLOAT 0x400 -+#define EF_ARM_MAVERICK_FLOAT 0x800 -+ -+ -+/* Other constants defined in the ARM ELF spec. version B-01. */ -+/* NB. These conflict with values defined above. */ -+#define EF_ARM_SYMSARESORTED 0x04 -+#define EF_ARM_DYNSYMSUSESEGIDX 0x08 -+#define EF_ARM_MAPSYMSFIRST 0x10 -+#define EF_ARM_EABIMASK 0XFF000000 -+ -+/* Constants defined in AAELF. */ -+#define EF_ARM_BE8 0x00800000 -+#define EF_ARM_LE8 0x00400000 -+ -+#define EF_ARM_EABI_VERSION(flags) ((flags) & EF_ARM_EABIMASK) -+#define EF_ARM_EABI_UNKNOWN 0x00000000 -+#define EF_ARM_EABI_VER1 0x01000000 -+#define EF_ARM_EABI_VER2 0x02000000 -+#define EF_ARM_EABI_VER3 0x03000000 -+#define EF_ARM_EABI_VER4 0x04000000 -+#define EF_ARM_EABI_VER5 0x05000000 -+ -+/* Additional symbol types for Thumb. */ -+#define STT_ARM_TFUNC STT_LOPROC /* A Thumb function. */ -+#define STT_ARM_16BIT STT_HIPROC /* A Thumb label. */ -+ -+/* ARM-specific values for sh_flags */ -+#define SHF_ARM_ENTRYSECT 0x10000000 /* Section contains an entry point */ -+#define SHF_ARM_COMDEF 0x80000000 /* Section may be multiply defined -+ in the input to a link step. */ -+ -+/* ARM-specific program header flags */ -+#define PF_ARM_SB 0x10000000 /* Segment contains the location -+ addressed by the static base. */ -+#define PF_ARM_PI 0x20000000 /* Position-independent segment. */ -+#define PF_ARM_ABS 0x40000000 /* Absolute segment. */ -+ -+/* Processor specific values for the Phdr p_type field. */ -+#define PT_ARM_EXIDX (PT_LOPROC + 1) /* ARM unwind segment. */ -+ -+/* Processor specific values for the Shdr sh_type field. */ -+#define SHT_ARM_EXIDX (SHT_LOPROC + 1) /* ARM unwind section. */ -+#define SHT_ARM_PREEMPTMAP (SHT_LOPROC + 2) /* Preemption details. */ -+#define SHT_ARM_ATTRIBUTES (SHT_LOPROC + 3) /* ARM attributes section. */ -+ -+ -+/* ARM relocs. */ -+ -+#define R_ARM_NONE 0 /* No reloc */ -+#define R_ARM_PC24 1 /* PC relative 26 bit branch */ -+#define R_ARM_ABS32 2 /* Direct 32 bit */ -+#define R_ARM_REL32 3 /* PC relative 32 bit */ -+#define R_ARM_PC13 4 -+#define R_ARM_ABS16 5 /* Direct 16 bit */ -+#define R_ARM_ABS12 6 /* Direct 12 bit */ -+#define R_ARM_THM_ABS5 7 -+#define R_ARM_ABS8 8 /* Direct 8 bit */ -+#define R_ARM_SBREL32 9 -+#define R_ARM_THM_PC22 10 -+#define R_ARM_THM_PC8 11 -+#define R_ARM_AMP_VCALL9 12 -+#define R_ARM_SWI24 13 /* Obsolete static relocation. */ -+#define R_ARM_TLS_DESC 13 /* Dynamic relocation. 
*/ -+#define R_ARM_THM_SWI8 14 -+#define R_ARM_XPC25 15 -+#define R_ARM_THM_XPC22 16 -+#define R_ARM_TLS_DTPMOD32 17 /* ID of module containing symbol */ -+#define R_ARM_TLS_DTPOFF32 18 /* Offset in TLS block */ -+#define R_ARM_TLS_TPOFF32 19 /* Offset in static TLS block */ -+#define R_ARM_COPY 20 /* Copy symbol at runtime */ -+#define R_ARM_GLOB_DAT 21 /* Create GOT entry */ -+#define R_ARM_JUMP_SLOT 22 /* Create PLT entry */ -+#define R_ARM_RELATIVE 23 /* Adjust by program base */ -+#define R_ARM_GOTOFF 24 /* 32 bit offset to GOT */ -+#define R_ARM_GOTPC 25 /* 32 bit PC relative offset to GOT */ -+#define R_ARM_GOT32 26 /* 32 bit GOT entry */ -+#define R_ARM_PLT32 27 /* 32 bit PLT address */ -+#define R_ARM_ALU_PCREL_7_0 32 -+#define R_ARM_ALU_PCREL_15_8 33 -+#define R_ARM_ALU_PCREL_23_15 34 -+#define R_ARM_LDR_SBREL_11_0 35 -+#define R_ARM_ALU_SBREL_19_12 36 -+#define R_ARM_ALU_SBREL_27_20 37 -+#define R_ARM_TLS_GOTDESC 90 -+#define R_ARM_TLS_CALL 91 -+#define R_ARM_TLS_DESCSEQ 92 -+#define R_ARM_THM_TLS_CALL 93 -+#define R_ARM_GNU_VTENTRY 100 -+#define R_ARM_GNU_VTINHERIT 101 -+#define R_ARM_THM_PC11 102 /* thumb unconditional branch */ -+#define R_ARM_THM_PC9 103 /* thumb conditional branch */ -+#define R_ARM_TLS_GD32 104 /* PC-rel 32 bit for global dynamic -+ thread local data */ -+#define R_ARM_TLS_LDM32 105 /* PC-rel 32 bit for local dynamic -+ thread local data */ -+#define R_ARM_TLS_LDO32 106 /* 32 bit offset relative to TLS -+ block */ -+#define R_ARM_TLS_IE32 107 /* PC-rel 32 bit for GOT entry of -+ static TLS block offset */ -+#define R_ARM_TLS_LE32 108 /* 32 bit offset relative to static -+ TLS block */ -+#define R_ARM_THM_TLS_DESCSEQ 129 -+#define R_ARM_IRELATIVE 160 -+#define R_ARM_RXPC25 249 -+#define R_ARM_RSBREL32 250 -+#define R_ARM_THM_RPC22 251 -+#define R_ARM_RREL32 252 -+#define R_ARM_RABS22 253 -+#define R_ARM_RPC24 254 -+#define R_ARM_RBASE 255 -+/* Keep this the last entry. */ -+#define R_ARM_NUM 256 -+ -+/* IA-64 specific declarations. */ -+ -+/* Processor specific flags for the Ehdr e_flags field. */ -+#define EF_IA_64_MASKOS 0x0000000f /* os-specific flags */ -+#define EF_IA_64_ABI64 0x00000010 /* 64-bit ABI */ -+#define EF_IA_64_ARCH 0xff000000 /* arch. version mask */ -+ -+/* Processor specific values for the Phdr p_type field. */ -+#define PT_IA_64_ARCHEXT (PT_LOPROC + 0) /* arch extension bits */ -+#define PT_IA_64_UNWIND (PT_LOPROC + 1) /* ia64 unwind bits */ -+#define PT_IA_64_HP_OPT_ANOT (PT_LOOS + 0x12) -+#define PT_IA_64_HP_HSL_ANOT (PT_LOOS + 0x13) -+#define PT_IA_64_HP_STACK (PT_LOOS + 0x14) -+ -+/* Processor specific flags for the Phdr p_flags field. */ -+#define PF_IA_64_NORECOV 0x80000000 /* spec insns w/o recovery */ -+ -+/* Processor specific values for the Shdr sh_type field. */ -+#define SHT_IA_64_EXT (SHT_LOPROC + 0) /* extension bits */ -+#define SHT_IA_64_UNWIND (SHT_LOPROC + 1) /* unwind bits */ -+ -+/* Processor specific flags for the Shdr sh_flags field. */ -+#define SHF_IA_64_SHORT 0x10000000 /* section near gp */ -+#define SHF_IA_64_NORECOV 0x20000000 /* spec insns w/o recovery */ -+ -+/* Processor specific values for the Dyn d_tag field. */ -+#define DT_IA_64_PLT_RESERVE (DT_LOPROC + 0) -+#define DT_IA_64_NUM 1 -+ -+/* IA-64 relocations. 
*/ -+#define R_IA64_NONE 0x00 /* none */ -+#define R_IA64_IMM14 0x21 /* symbol + addend, add imm14 */ -+#define R_IA64_IMM22 0x22 /* symbol + addend, add imm22 */ -+#define R_IA64_IMM64 0x23 /* symbol + addend, mov imm64 */ -+#define R_IA64_DIR32MSB 0x24 /* symbol + addend, data4 MSB */ -+#define R_IA64_DIR32LSB 0x25 /* symbol + addend, data4 LSB */ -+#define R_IA64_DIR64MSB 0x26 /* symbol + addend, data8 MSB */ -+#define R_IA64_DIR64LSB 0x27 /* symbol + addend, data8 LSB */ -+#define R_IA64_GPREL22 0x2a /* @gprel(sym + add), add imm22 */ -+#define R_IA64_GPREL64I 0x2b /* @gprel(sym + add), mov imm64 */ -+#define R_IA64_GPREL32MSB 0x2c /* @gprel(sym + add), data4 MSB */ -+#define R_IA64_GPREL32LSB 0x2d /* @gprel(sym + add), data4 LSB */ -+#define R_IA64_GPREL64MSB 0x2e /* @gprel(sym + add), data8 MSB */ -+#define R_IA64_GPREL64LSB 0x2f /* @gprel(sym + add), data8 LSB */ -+#define R_IA64_LTOFF22 0x32 /* @ltoff(sym + add), add imm22 */ -+#define R_IA64_LTOFF64I 0x33 /* @ltoff(sym + add), mov imm64 */ -+#define R_IA64_PLTOFF22 0x3a /* @pltoff(sym + add), add imm22 */ -+#define R_IA64_PLTOFF64I 0x3b /* @pltoff(sym + add), mov imm64 */ -+#define R_IA64_PLTOFF64MSB 0x3e /* @pltoff(sym + add), data8 MSB */ -+#define R_IA64_PLTOFF64LSB 0x3f /* @pltoff(sym + add), data8 LSB */ -+#define R_IA64_FPTR64I 0x43 /* @fptr(sym + add), mov imm64 */ -+#define R_IA64_FPTR32MSB 0x44 /* @fptr(sym + add), data4 MSB */ -+#define R_IA64_FPTR32LSB 0x45 /* @fptr(sym + add), data4 LSB */ -+#define R_IA64_FPTR64MSB 0x46 /* @fptr(sym + add), data8 MSB */ -+#define R_IA64_FPTR64LSB 0x47 /* @fptr(sym + add), data8 LSB */ -+#define R_IA64_PCREL60B 0x48 /* @pcrel(sym + add), brl */ -+#define R_IA64_PCREL21B 0x49 /* @pcrel(sym + add), ptb, call */ -+#define R_IA64_PCREL21M 0x4a /* @pcrel(sym + add), chk.s */ -+#define R_IA64_PCREL21F 0x4b /* @pcrel(sym + add), fchkf */ -+#define R_IA64_PCREL32MSB 0x4c /* @pcrel(sym + add), data4 MSB */ -+#define R_IA64_PCREL32LSB 0x4d /* @pcrel(sym + add), data4 LSB */ -+#define R_IA64_PCREL64MSB 0x4e /* @pcrel(sym + add), data8 MSB */ -+#define R_IA64_PCREL64LSB 0x4f /* @pcrel(sym + add), data8 LSB */ -+#define R_IA64_LTOFF_FPTR22 0x52 /* @ltoff(@fptr(s+a)), imm22 */ -+#define R_IA64_LTOFF_FPTR64I 0x53 /* @ltoff(@fptr(s+a)), imm64 */ -+#define R_IA64_LTOFF_FPTR32MSB 0x54 /* @ltoff(@fptr(s+a)), data4 MSB */ -+#define R_IA64_LTOFF_FPTR32LSB 0x55 /* @ltoff(@fptr(s+a)), data4 LSB */ -+#define R_IA64_LTOFF_FPTR64MSB 0x56 /* @ltoff(@fptr(s+a)), data8 MSB */ -+#define R_IA64_LTOFF_FPTR64LSB 0x57 /* @ltoff(@fptr(s+a)), data8 LSB */ -+#define R_IA64_SEGREL32MSB 0x5c /* @segrel(sym + add), data4 MSB */ -+#define R_IA64_SEGREL32LSB 0x5d /* @segrel(sym + add), data4 LSB */ -+#define R_IA64_SEGREL64MSB 0x5e /* @segrel(sym + add), data8 MSB */ -+#define R_IA64_SEGREL64LSB 0x5f /* @segrel(sym + add), data8 LSB */ -+#define R_IA64_SECREL32MSB 0x64 /* @secrel(sym + add), data4 MSB */ -+#define R_IA64_SECREL32LSB 0x65 /* @secrel(sym + add), data4 LSB */ -+#define R_IA64_SECREL64MSB 0x66 /* @secrel(sym + add), data8 MSB */ -+#define R_IA64_SECREL64LSB 0x67 /* @secrel(sym + add), data8 LSB */ -+#define R_IA64_REL32MSB 0x6c /* data 4 + REL */ -+#define R_IA64_REL32LSB 0x6d /* data 4 + REL */ -+#define R_IA64_REL64MSB 0x6e /* data 8 + REL */ -+#define R_IA64_REL64LSB 0x6f /* data 8 + REL */ -+#define R_IA64_LTV32MSB 0x74 /* symbol + addend, data4 MSB */ -+#define R_IA64_LTV32LSB 0x75 /* symbol + addend, data4 LSB */ -+#define R_IA64_LTV64MSB 0x76 /* symbol + addend, data8 MSB */ -+#define R_IA64_LTV64LSB 0x77 
/* symbol + addend, data8 LSB */ -+#define R_IA64_PCREL21BI 0x79 /* @pcrel(sym + add), 21bit inst */ -+#define R_IA64_PCREL22 0x7a /* @pcrel(sym + add), 22bit inst */ -+#define R_IA64_PCREL64I 0x7b /* @pcrel(sym + add), 64bit inst */ -+#define R_IA64_IPLTMSB 0x80 /* dynamic reloc, imported PLT, MSB */ -+#define R_IA64_IPLTLSB 0x81 /* dynamic reloc, imported PLT, LSB */ -+#define R_IA64_COPY 0x84 /* copy relocation */ -+#define R_IA64_SUB 0x85 /* Addend and symbol difference */ -+#define R_IA64_LTOFF22X 0x86 /* LTOFF22, relaxable. */ -+#define R_IA64_LDXMOV 0x87 /* Use of LTOFF22X. */ -+#define R_IA64_TPREL14 0x91 /* @tprel(sym + add), imm14 */ -+#define R_IA64_TPREL22 0x92 /* @tprel(sym + add), imm22 */ -+#define R_IA64_TPREL64I 0x93 /* @tprel(sym + add), imm64 */ -+#define R_IA64_TPREL64MSB 0x96 /* @tprel(sym + add), data8 MSB */ -+#define R_IA64_TPREL64LSB 0x97 /* @tprel(sym + add), data8 LSB */ -+#define R_IA64_LTOFF_TPREL22 0x9a /* @ltoff(@tprel(s+a)), imm2 */ -+#define R_IA64_DTPMOD64MSB 0xa6 /* @dtpmod(sym + add), data8 MSB */ -+#define R_IA64_DTPMOD64LSB 0xa7 /* @dtpmod(sym + add), data8 LSB */ -+#define R_IA64_LTOFF_DTPMOD22 0xaa /* @ltoff(@dtpmod(sym + add)), imm22 */ -+#define R_IA64_DTPREL14 0xb1 /* @dtprel(sym + add), imm14 */ -+#define R_IA64_DTPREL22 0xb2 /* @dtprel(sym + add), imm22 */ -+#define R_IA64_DTPREL64I 0xb3 /* @dtprel(sym + add), imm64 */ -+#define R_IA64_DTPREL32MSB 0xb4 /* @dtprel(sym + add), data4 MSB */ -+#define R_IA64_DTPREL32LSB 0xb5 /* @dtprel(sym + add), data4 LSB */ -+#define R_IA64_DTPREL64MSB 0xb6 /* @dtprel(sym + add), data8 MSB */ -+#define R_IA64_DTPREL64LSB 0xb7 /* @dtprel(sym + add), data8 LSB */ -+#define R_IA64_LTOFF_DTPREL22 0xba /* @ltoff(@dtprel(s+a)), imm22 */ -+ -+/* SH specific declarations */ -+ -+/* Processor specific flags for the ELF header e_flags field. */ -+#define EF_SH_MACH_MASK 0x1f -+#define EF_SH_UNKNOWN 0x0 -+#define EF_SH1 0x1 -+#define EF_SH2 0x2 -+#define EF_SH3 0x3 -+#define EF_SH_DSP 0x4 -+#define EF_SH3_DSP 0x5 -+#define EF_SH4AL_DSP 0x6 -+#define EF_SH3E 0x8 -+#define EF_SH4 0x9 -+#define EF_SH2E 0xb -+#define EF_SH4A 0xc -+#define EF_SH2A 0xd -+#define EF_SH4_NOFPU 0x10 -+#define EF_SH4A_NOFPU 0x11 -+#define EF_SH4_NOMMU_NOFPU 0x12 -+#define EF_SH2A_NOFPU 0x13 -+#define EF_SH3_NOMMU 0x14 -+#define EF_SH2A_SH4_NOFPU 0x15 -+#define EF_SH2A_SH3_NOFPU 0x16 -+#define EF_SH2A_SH4 0x17 -+#define EF_SH2A_SH3E 0x18 -+ -+/* SH relocs. */ -+#define R_SH_NONE 0 -+#define R_SH_DIR32 1 -+#define R_SH_REL32 2 -+#define R_SH_DIR8WPN 3 -+#define R_SH_IND12W 4 -+#define R_SH_DIR8WPL 5 -+#define R_SH_DIR8WPZ 6 -+#define R_SH_DIR8BP 7 -+#define R_SH_DIR8W 8 -+#define R_SH_DIR8L 9 -+#define R_SH_SWITCH16 25 -+#define R_SH_SWITCH32 26 -+#define R_SH_USES 27 -+#define R_SH_COUNT 28 -+#define R_SH_ALIGN 29 -+#define R_SH_CODE 30 -+#define R_SH_DATA 31 -+#define R_SH_LABEL 32 -+#define R_SH_SWITCH8 33 -+#define R_SH_GNU_VTINHERIT 34 -+#define R_SH_GNU_VTENTRY 35 -+#define R_SH_TLS_GD_32 144 -+#define R_SH_TLS_LD_32 145 -+#define R_SH_TLS_LDO_32 146 -+#define R_SH_TLS_IE_32 147 -+#define R_SH_TLS_LE_32 148 -+#define R_SH_TLS_DTPMOD32 149 -+#define R_SH_TLS_DTPOFF32 150 -+#define R_SH_TLS_TPOFF32 151 -+#define R_SH_GOT32 160 -+#define R_SH_PLT32 161 -+#define R_SH_COPY 162 -+#define R_SH_GLOB_DAT 163 -+#define R_SH_JMP_SLOT 164 -+#define R_SH_RELATIVE 165 -+#define R_SH_GOTOFF 166 -+#define R_SH_GOTPC 167 -+/* Keep this the last entry. */ -+#define R_SH_NUM 256 -+ -+/* S/390 specific definitions. */ -+ -+/* Valid values for the e_flags field. 
*/ -+ -+#define EF_S390_HIGH_GPRS 0x00000001 /* High GPRs kernel facility needed. */ -+ -+/* Additional s390 relocs */ -+ -+#define R_390_NONE 0 /* No reloc. */ -+#define R_390_8 1 /* Direct 8 bit. */ -+#define R_390_12 2 /* Direct 12 bit. */ -+#define R_390_16 3 /* Direct 16 bit. */ -+#define R_390_32 4 /* Direct 32 bit. */ -+#define R_390_PC32 5 /* PC relative 32 bit. */ -+#define R_390_GOT12 6 /* 12 bit GOT offset. */ -+#define R_390_GOT32 7 /* 32 bit GOT offset. */ -+#define R_390_PLT32 8 /* 32 bit PC relative PLT address. */ -+#define R_390_COPY 9 /* Copy symbol at runtime. */ -+#define R_390_GLOB_DAT 10 /* Create GOT entry. */ -+#define R_390_JMP_SLOT 11 /* Create PLT entry. */ -+#define R_390_RELATIVE 12 /* Adjust by program base. */ -+#define R_390_GOTOFF32 13 /* 32 bit offset to GOT. */ -+#define R_390_GOTPC 14 /* 32 bit PC relative offset to GOT. */ -+#define R_390_GOT16 15 /* 16 bit GOT offset. */ -+#define R_390_PC16 16 /* PC relative 16 bit. */ -+#define R_390_PC16DBL 17 /* PC relative 16 bit shifted by 1. */ -+#define R_390_PLT16DBL 18 /* 16 bit PC rel. PLT shifted by 1. */ -+#define R_390_PC32DBL 19 /* PC relative 32 bit shifted by 1. */ -+#define R_390_PLT32DBL 20 /* 32 bit PC rel. PLT shifted by 1. */ -+#define R_390_GOTPCDBL 21 /* 32 bit PC rel. GOT shifted by 1. */ -+#define R_390_64 22 /* Direct 64 bit. */ -+#define R_390_PC64 23 /* PC relative 64 bit. */ -+#define R_390_GOT64 24 /* 64 bit GOT offset. */ -+#define R_390_PLT64 25 /* 64 bit PC relative PLT address. */ -+#define R_390_GOTENT 26 /* 32 bit PC rel. to GOT entry >> 1. */ -+#define R_390_GOTOFF16 27 /* 16 bit offset to GOT. */ -+#define R_390_GOTOFF64 28 /* 64 bit offset to GOT. */ -+#define R_390_GOTPLT12 29 /* 12 bit offset to jump slot. */ -+#define R_390_GOTPLT16 30 /* 16 bit offset to jump slot. */ -+#define R_390_GOTPLT32 31 /* 32 bit offset to jump slot. */ -+#define R_390_GOTPLT64 32 /* 64 bit offset to jump slot. */ -+#define R_390_GOTPLTENT 33 /* 32 bit rel. offset to jump slot. */ -+#define R_390_PLTOFF16 34 /* 16 bit offset from GOT to PLT. */ -+#define R_390_PLTOFF32 35 /* 32 bit offset from GOT to PLT. */ -+#define R_390_PLTOFF64 36 /* 16 bit offset from GOT to PLT. */ -+#define R_390_TLS_LOAD 37 /* Tag for load insn in TLS code. */ -+#define R_390_TLS_GDCALL 38 /* Tag for function call in general -+ dynamic TLS code. */ -+#define R_390_TLS_LDCALL 39 /* Tag for function call in local -+ dynamic TLS code. */ -+#define R_390_TLS_GD32 40 /* Direct 32 bit for general dynamic -+ thread local data. */ -+#define R_390_TLS_GD64 41 /* Direct 64 bit for general dynamic -+ thread local data. */ -+#define R_390_TLS_GOTIE12 42 /* 12 bit GOT offset for static TLS -+ block offset. */ -+#define R_390_TLS_GOTIE32 43 /* 32 bit GOT offset for static TLS -+ block offset. */ -+#define R_390_TLS_GOTIE64 44 /* 64 bit GOT offset for static TLS -+ block offset. */ -+#define R_390_TLS_LDM32 45 /* Direct 32 bit for local dynamic -+ thread local data in LE code. */ -+#define R_390_TLS_LDM64 46 /* Direct 64 bit for local dynamic -+ thread local data in LE code. */ -+#define R_390_TLS_IE32 47 /* 32 bit address of GOT entry for -+ negated static TLS block offset. */ -+#define R_390_TLS_IE64 48 /* 64 bit address of GOT entry for -+ negated static TLS block offset. */ -+#define R_390_TLS_IEENT 49 /* 32 bit rel. offset to GOT entry for -+ negated static TLS block offset. */ -+#define R_390_TLS_LE32 50 /* 32 bit negated offset relative to -+ static TLS block. 
*/ -+#define R_390_TLS_LE64 51 /* 64 bit negated offset relative to -+ static TLS block. */ -+#define R_390_TLS_LDO32 52 /* 32 bit offset relative to TLS -+ block. */ -+#define R_390_TLS_LDO64 53 /* 64 bit offset relative to TLS -+ block. */ -+#define R_390_TLS_DTPMOD 54 /* ID of module containing symbol. */ -+#define R_390_TLS_DTPOFF 55 /* Offset in TLS block. */ -+#define R_390_TLS_TPOFF 56 /* Negated offset in static TLS -+ block. */ -+#define R_390_20 57 /* Direct 20 bit. */ -+#define R_390_GOT20 58 /* 20 bit GOT offset. */ -+#define R_390_GOTPLT20 59 /* 20 bit offset to jump slot. */ -+#define R_390_TLS_GOTIE20 60 /* 20 bit GOT offset for static TLS -+ block offset. */ -+#define R_390_IRELATIVE 61 /* STT_GNU_IFUNC relocation. */ -+/* Keep this the last entry. */ -+#define R_390_NUM 62 -+ -+ -+/* CRIS relocations. */ -+#define R_CRIS_NONE 0 -+#define R_CRIS_8 1 -+#define R_CRIS_16 2 -+#define R_CRIS_32 3 -+#define R_CRIS_8_PCREL 4 -+#define R_CRIS_16_PCREL 5 -+#define R_CRIS_32_PCREL 6 -+#define R_CRIS_GNU_VTINHERIT 7 -+#define R_CRIS_GNU_VTENTRY 8 -+#define R_CRIS_COPY 9 -+#define R_CRIS_GLOB_DAT 10 -+#define R_CRIS_JUMP_SLOT 11 -+#define R_CRIS_RELATIVE 12 -+#define R_CRIS_16_GOT 13 -+#define R_CRIS_32_GOT 14 -+#define R_CRIS_16_GOTPLT 15 -+#define R_CRIS_32_GOTPLT 16 -+#define R_CRIS_32_GOTREL 17 -+#define R_CRIS_32_PLT_GOTREL 18 -+#define R_CRIS_32_PLT_PCREL 19 -+ -+#define R_CRIS_NUM 20 -+ -+ -+/* AMD x86-64 relocations. */ -+#define R_X86_64_NONE 0 /* No reloc */ -+#define R_X86_64_64 1 /* Direct 64 bit */ -+#define R_X86_64_PC32 2 /* PC relative 32 bit signed */ -+#define R_X86_64_GOT32 3 /* 32 bit GOT entry */ -+#define R_X86_64_PLT32 4 /* 32 bit PLT address */ -+#define R_X86_64_COPY 5 /* Copy symbol at runtime */ -+#define R_X86_64_GLOB_DAT 6 /* Create GOT entry */ -+#define R_X86_64_JUMP_SLOT 7 /* Create PLT entry */ -+#define R_X86_64_RELATIVE 8 /* Adjust by program base */ -+#define R_X86_64_GOTPCREL 9 /* 32 bit signed PC relative -+ offset to GOT */ -+#define R_X86_64_32 10 /* Direct 32 bit zero extended */ -+#define R_X86_64_32S 11 /* Direct 32 bit sign extended */ -+#define R_X86_64_16 12 /* Direct 16 bit zero extended */ -+#define R_X86_64_PC16 13 /* 16 bit sign extended pc relative */ -+#define R_X86_64_8 14 /* Direct 8 bit sign extended */ -+#define R_X86_64_PC8 15 /* 8 bit sign extended pc relative */ -+#define R_X86_64_DTPMOD64 16 /* ID of module containing symbol */ -+#define R_X86_64_DTPOFF64 17 /* Offset in module's TLS block */ -+#define R_X86_64_TPOFF64 18 /* Offset in initial TLS block */ -+#define R_X86_64_TLSGD 19 /* 32 bit signed PC relative offset -+ to two GOT entries for GD symbol */ -+#define R_X86_64_TLSLD 20 /* 32 bit signed PC relative offset -+ to two GOT entries for LD symbol */ -+#define R_X86_64_DTPOFF32 21 /* Offset in TLS block */ -+#define R_X86_64_GOTTPOFF 22 /* 32 bit signed PC relative offset -+ to GOT entry for IE symbol */ -+#define R_X86_64_TPOFF32 23 /* Offset in initial TLS block */ -+#define R_X86_64_PC64 24 /* PC relative 64 bit */ -+#define R_X86_64_GOTOFF64 25 /* 64 bit offset to GOT */ -+#define R_X86_64_GOTPC32 26 /* 32 bit signed pc relative -+ offset to GOT */ -+#define R_X86_64_GOT64 27 /* 64-bit GOT entry offset */ -+#define R_X86_64_GOTPCREL64 28 /* 64-bit PC relative offset -+ to GOT entry */ -+#define R_X86_64_GOTPC64 29 /* 64-bit PC relative offset to GOT */ -+#define R_X86_64_GOTPLT64 30 /* like GOT64, says PLT entry needed */ -+#define R_X86_64_PLTOFF64 31 /* 64-bit GOT relative offset -+ to PLT entry */ -+#define 
R_X86_64_SIZE32 32 /* Size of symbol plus 32-bit addend */ -+#define R_X86_64_SIZE64 33 /* Size of symbol plus 64-bit addend */ -+#define R_X86_64_GOTPC32_TLSDESC 34 /* GOT offset for TLS descriptor. */ -+#define R_X86_64_TLSDESC_CALL 35 /* Marker for call through TLS -+ descriptor. */ -+#define R_X86_64_TLSDESC 36 /* TLS descriptor. */ -+#define R_X86_64_IRELATIVE 37 /* Adjust indirectly by program base */ -+#define R_X86_64_RELATIVE64 38 /* 64-bit adjust by program base */ -+ -+#define R_X86_64_NUM 39 -+ -+ -+/* AM33 relocations. */ -+#define R_MN10300_NONE 0 /* No reloc. */ -+#define R_MN10300_32 1 /* Direct 32 bit. */ -+#define R_MN10300_16 2 /* Direct 16 bit. */ -+#define R_MN10300_8 3 /* Direct 8 bit. */ -+#define R_MN10300_PCREL32 4 /* PC-relative 32-bit. */ -+#define R_MN10300_PCREL16 5 /* PC-relative 16-bit signed. */ -+#define R_MN10300_PCREL8 6 /* PC-relative 8-bit signed. */ -+#define R_MN10300_GNU_VTINHERIT 7 /* Ancient C++ vtable garbage... */ -+#define R_MN10300_GNU_VTENTRY 8 /* ... collection annotation. */ -+#define R_MN10300_24 9 /* Direct 24 bit. */ -+#define R_MN10300_GOTPC32 10 /* 32-bit PCrel offset to GOT. */ -+#define R_MN10300_GOTPC16 11 /* 16-bit PCrel offset to GOT. */ -+#define R_MN10300_GOTOFF32 12 /* 32-bit offset from GOT. */ -+#define R_MN10300_GOTOFF24 13 /* 24-bit offset from GOT. */ -+#define R_MN10300_GOTOFF16 14 /* 16-bit offset from GOT. */ -+#define R_MN10300_PLT32 15 /* 32-bit PCrel to PLT entry. */ -+#define R_MN10300_PLT16 16 /* 16-bit PCrel to PLT entry. */ -+#define R_MN10300_GOT32 17 /* 32-bit offset to GOT entry. */ -+#define R_MN10300_GOT24 18 /* 24-bit offset to GOT entry. */ -+#define R_MN10300_GOT16 19 /* 16-bit offset to GOT entry. */ -+#define R_MN10300_COPY 20 /* Copy symbol at runtime. */ -+#define R_MN10300_GLOB_DAT 21 /* Create GOT entry. */ -+#define R_MN10300_JMP_SLOT 22 /* Create PLT entry. */ -+#define R_MN10300_RELATIVE 23 /* Adjust by program base. */ -+ -+#define R_MN10300_NUM 24 -+ -+ -+/* M32R relocs. */ -+#define R_M32R_NONE 0 /* No reloc. */ -+#define R_M32R_16 1 /* Direct 16 bit. */ -+#define R_M32R_32 2 /* Direct 32 bit. */ -+#define R_M32R_24 3 /* Direct 24 bit. */ -+#define R_M32R_10_PCREL 4 /* PC relative 10 bit shifted. */ -+#define R_M32R_18_PCREL 5 /* PC relative 18 bit shifted. */ -+#define R_M32R_26_PCREL 6 /* PC relative 26 bit shifted. */ -+#define R_M32R_HI16_ULO 7 /* High 16 bit with unsigned low. */ -+#define R_M32R_HI16_SLO 8 /* High 16 bit with signed low. */ -+#define R_M32R_LO16 9 /* Low 16 bit. */ -+#define R_M32R_SDA16 10 /* 16 bit offset in SDA. */ -+#define R_M32R_GNU_VTINHERIT 11 -+#define R_M32R_GNU_VTENTRY 12 -+/* M32R relocs use SHT_RELA. */ -+#define R_M32R_16_RELA 33 /* Direct 16 bit. */ -+#define R_M32R_32_RELA 34 /* Direct 32 bit. */ -+#define R_M32R_24_RELA 35 /* Direct 24 bit. */ -+#define R_M32R_10_PCREL_RELA 36 /* PC relative 10 bit shifted. */ -+#define R_M32R_18_PCREL_RELA 37 /* PC relative 18 bit shifted. */ -+#define R_M32R_26_PCREL_RELA 38 /* PC relative 26 bit shifted. */ -+#define R_M32R_HI16_ULO_RELA 39 /* High 16 bit with unsigned low */ -+#define R_M32R_HI16_SLO_RELA 40 /* High 16 bit with signed low */ -+#define R_M32R_LO16_RELA 41 /* Low 16 bit */ -+#define R_M32R_SDA16_RELA 42 /* 16 bit offset in SDA */ -+#define R_M32R_RELA_GNU_VTINHERIT 43 -+#define R_M32R_RELA_GNU_VTENTRY 44 -+#define R_M32R_REL32 45 /* PC relative 32 bit. 
*/ -+ -+#define R_M32R_GOT24 48 /* 24 bit GOT entry */ -+#define R_M32R_26_PLTREL 49 /* 26 bit PC relative to PLT shifted */ -+#define R_M32R_COPY 50 /* Copy symbol at runtime */ -+#define R_M32R_GLOB_DAT 51 /* Create GOT entry */ -+#define R_M32R_JMP_SLOT 52 /* Create PLT entry */ -+#define R_M32R_RELATIVE 53 /* Adjust by program base */ -+#define R_M32R_GOTOFF 54 /* 24 bit offset to GOT */ -+#define R_M32R_GOTPC24 55 /* 24 bit PC relative offset to GOT */ -+#define R_M32R_GOT16_HI_ULO 56 /* High 16 bit GOT entry with unsigned -+ low */ -+#define R_M32R_GOT16_HI_SLO 57 /* High 16 bit GOT entry with signed -+ low */ -+#define R_M32R_GOT16_LO 58 /* Low 16 bit GOT entry */ -+#define R_M32R_GOTPC_HI_ULO 59 /* High 16 bit PC relative offset to -+ GOT with unsigned low */ -+#define R_M32R_GOTPC_HI_SLO 60 /* High 16 bit PC relative offset to -+ GOT with signed low */ -+#define R_M32R_GOTPC_LO 61 /* Low 16 bit PC relative offset to -+ GOT */ -+#define R_M32R_GOTOFF_HI_ULO 62 /* High 16 bit offset to GOT -+ with unsigned low */ -+#define R_M32R_GOTOFF_HI_SLO 63 /* High 16 bit offset to GOT -+ with signed low */ -+#define R_M32R_GOTOFF_LO 64 /* Low 16 bit offset to GOT */ -+#define R_M32R_NUM 256 /* Keep this the last entry. */ -+ -+ -+/* TILEPro relocations. */ -+#define R_TILEPRO_NONE 0 /* No reloc */ -+#define R_TILEPRO_32 1 /* Direct 32 bit */ -+#define R_TILEPRO_16 2 /* Direct 16 bit */ -+#define R_TILEPRO_8 3 /* Direct 8 bit */ -+#define R_TILEPRO_32_PCREL 4 /* PC relative 32 bit */ -+#define R_TILEPRO_16_PCREL 5 /* PC relative 16 bit */ -+#define R_TILEPRO_8_PCREL 6 /* PC relative 8 bit */ -+#define R_TILEPRO_LO16 7 /* Low 16 bit */ -+#define R_TILEPRO_HI16 8 /* High 16 bit */ -+#define R_TILEPRO_HA16 9 /* High 16 bit, adjusted */ -+#define R_TILEPRO_COPY 10 /* Copy relocation */ -+#define R_TILEPRO_GLOB_DAT 11 /* Create GOT entry */ -+#define R_TILEPRO_JMP_SLOT 12 /* Create PLT entry */ -+#define R_TILEPRO_RELATIVE 13 /* Adjust by program base */ -+#define R_TILEPRO_BROFF_X1 14 /* X1 pipe branch offset */ -+#define R_TILEPRO_JOFFLONG_X1 15 /* X1 pipe jump offset */ -+#define R_TILEPRO_JOFFLONG_X1_PLT 16 /* X1 pipe jump offset to PLT */ -+#define R_TILEPRO_IMM8_X0 17 /* X0 pipe 8-bit */ -+#define R_TILEPRO_IMM8_Y0 18 /* Y0 pipe 8-bit */ -+#define R_TILEPRO_IMM8_X1 19 /* X1 pipe 8-bit */ -+#define R_TILEPRO_IMM8_Y1 20 /* Y1 pipe 8-bit */ -+#define R_TILEPRO_MT_IMM15_X1 21 /* X1 pipe mtspr */ -+#define R_TILEPRO_MF_IMM15_X1 22 /* X1 pipe mfspr */ -+#define R_TILEPRO_IMM16_X0 23 /* X0 pipe 16-bit */ -+#define R_TILEPRO_IMM16_X1 24 /* X1 pipe 16-bit */ -+#define R_TILEPRO_IMM16_X0_LO 25 /* X0 pipe low 16-bit */ -+#define R_TILEPRO_IMM16_X1_LO 26 /* X1 pipe low 16-bit */ -+#define R_TILEPRO_IMM16_X0_HI 27 /* X0 pipe high 16-bit */ -+#define R_TILEPRO_IMM16_X1_HI 28 /* X1 pipe high 16-bit */ -+#define R_TILEPRO_IMM16_X0_HA 29 /* X0 pipe high 16-bit, adjusted */ -+#define R_TILEPRO_IMM16_X1_HA 30 /* X1 pipe high 16-bit, adjusted */ -+#define R_TILEPRO_IMM16_X0_PCREL 31 /* X0 pipe PC relative 16 bit */ -+#define R_TILEPRO_IMM16_X1_PCREL 32 /* X1 pipe PC relative 16 bit */ -+#define R_TILEPRO_IMM16_X0_LO_PCREL 33 /* X0 pipe PC relative low 16 bit */ -+#define R_TILEPRO_IMM16_X1_LO_PCREL 34 /* X1 pipe PC relative low 16 bit */ -+#define R_TILEPRO_IMM16_X0_HI_PCREL 35 /* X0 pipe PC relative high 16 bit */ -+#define R_TILEPRO_IMM16_X1_HI_PCREL 36 /* X1 pipe PC relative high 16 bit */ -+#define R_TILEPRO_IMM16_X0_HA_PCREL 37 /* X0 pipe PC relative ha() 16 bit */ -+#define R_TILEPRO_IMM16_X1_HA_PCREL 38 
/* X1 pipe PC relative ha() 16 bit */ -+#define R_TILEPRO_IMM16_X0_GOT 39 /* X0 pipe 16-bit GOT offset */ -+#define R_TILEPRO_IMM16_X1_GOT 40 /* X1 pipe 16-bit GOT offset */ -+#define R_TILEPRO_IMM16_X0_GOT_LO 41 /* X0 pipe low 16-bit GOT offset */ -+#define R_TILEPRO_IMM16_X1_GOT_LO 42 /* X1 pipe low 16-bit GOT offset */ -+#define R_TILEPRO_IMM16_X0_GOT_HI 43 /* X0 pipe high 16-bit GOT offset */ -+#define R_TILEPRO_IMM16_X1_GOT_HI 44 /* X1 pipe high 16-bit GOT offset */ -+#define R_TILEPRO_IMM16_X0_GOT_HA 45 /* X0 pipe ha() 16-bit GOT offset */ -+#define R_TILEPRO_IMM16_X1_GOT_HA 46 /* X1 pipe ha() 16-bit GOT offset */ -+#define R_TILEPRO_MMSTART_X0 47 /* X0 pipe mm "start" */ -+#define R_TILEPRO_MMEND_X0 48 /* X0 pipe mm "end" */ -+#define R_TILEPRO_MMSTART_X1 49 /* X1 pipe mm "start" */ -+#define R_TILEPRO_MMEND_X1 50 /* X1 pipe mm "end" */ -+#define R_TILEPRO_SHAMT_X0 51 /* X0 pipe shift amount */ -+#define R_TILEPRO_SHAMT_X1 52 /* X1 pipe shift amount */ -+#define R_TILEPRO_SHAMT_Y0 53 /* Y0 pipe shift amount */ -+#define R_TILEPRO_SHAMT_Y1 54 /* Y1 pipe shift amount */ -+#define R_TILEPRO_DEST_IMM8_X1 55 /* X1 pipe destination 8-bit */ -+/* Relocs 56-59 are currently not defined. */ -+#define R_TILEPRO_TLS_GD_CALL 60 /* "jal" for TLS GD */ -+#define R_TILEPRO_IMM8_X0_TLS_GD_ADD 61 /* X0 pipe "addi" for TLS GD */ -+#define R_TILEPRO_IMM8_X1_TLS_GD_ADD 62 /* X1 pipe "addi" for TLS GD */ -+#define R_TILEPRO_IMM8_Y0_TLS_GD_ADD 63 /* Y0 pipe "addi" for TLS GD */ -+#define R_TILEPRO_IMM8_Y1_TLS_GD_ADD 64 /* Y1 pipe "addi" for TLS GD */ -+#define R_TILEPRO_TLS_IE_LOAD 65 /* "lw_tls" for TLS IE */ -+#define R_TILEPRO_IMM16_X0_TLS_GD 66 /* X0 pipe 16-bit TLS GD offset */ -+#define R_TILEPRO_IMM16_X1_TLS_GD 67 /* X1 pipe 16-bit TLS GD offset */ -+#define R_TILEPRO_IMM16_X0_TLS_GD_LO 68 /* X0 pipe low 16-bit TLS GD offset */ -+#define R_TILEPRO_IMM16_X1_TLS_GD_LO 69 /* X1 pipe low 16-bit TLS GD offset */ -+#define R_TILEPRO_IMM16_X0_TLS_GD_HI 70 /* X0 pipe high 16-bit TLS GD offset */ -+#define R_TILEPRO_IMM16_X1_TLS_GD_HI 71 /* X1 pipe high 16-bit TLS GD offset */ -+#define R_TILEPRO_IMM16_X0_TLS_GD_HA 72 /* X0 pipe ha() 16-bit TLS GD offset */ -+#define R_TILEPRO_IMM16_X1_TLS_GD_HA 73 /* X1 pipe ha() 16-bit TLS GD offset */ -+#define R_TILEPRO_IMM16_X0_TLS_IE 74 /* X0 pipe 16-bit TLS IE offset */ -+#define R_TILEPRO_IMM16_X1_TLS_IE 75 /* X1 pipe 16-bit TLS IE offset */ -+#define R_TILEPRO_IMM16_X0_TLS_IE_LO 76 /* X0 pipe low 16-bit TLS IE offset */ -+#define R_TILEPRO_IMM16_X1_TLS_IE_LO 77 /* X1 pipe low 16-bit TLS IE offset */ -+#define R_TILEPRO_IMM16_X0_TLS_IE_HI 78 /* X0 pipe high 16-bit TLS IE offset */ -+#define R_TILEPRO_IMM16_X1_TLS_IE_HI 79 /* X1 pipe high 16-bit TLS IE offset */ -+#define R_TILEPRO_IMM16_X0_TLS_IE_HA 80 /* X0 pipe ha() 16-bit TLS IE offset */ -+#define R_TILEPRO_IMM16_X1_TLS_IE_HA 81 /* X1 pipe ha() 16-bit TLS IE offset */ -+#define R_TILEPRO_TLS_DTPMOD32 82 /* ID of module containing symbol */ -+#define R_TILEPRO_TLS_DTPOFF32 83 /* Offset in TLS block */ -+#define R_TILEPRO_TLS_TPOFF32 84 /* Offset in static TLS block */ -+#define R_TILEPRO_IMM16_X0_TLS_LE 85 /* X0 pipe 16-bit TLS LE offset */ -+#define R_TILEPRO_IMM16_X1_TLS_LE 86 /* X1 pipe 16-bit TLS LE offset */ -+#define R_TILEPRO_IMM16_X0_TLS_LE_LO 87 /* X0 pipe low 16-bit TLS LE offset */ -+#define R_TILEPRO_IMM16_X1_TLS_LE_LO 88 /* X1 pipe low 16-bit TLS LE offset */ -+#define R_TILEPRO_IMM16_X0_TLS_LE_HI 89 /* X0 pipe high 16-bit TLS LE offset */ -+#define R_TILEPRO_IMM16_X1_TLS_LE_HI 90 /* X1 pipe high 
16-bit TLS LE offset */ -+#define R_TILEPRO_IMM16_X0_TLS_LE_HA 91 /* X0 pipe ha() 16-bit TLS LE offset */ -+#define R_TILEPRO_IMM16_X1_TLS_LE_HA 92 /* X1 pipe ha() 16-bit TLS LE offset */ -+ -+#define R_TILEPRO_GNU_VTINHERIT 128 /* GNU C++ vtable hierarchy */ -+#define R_TILEPRO_GNU_VTENTRY 129 /* GNU C++ vtable member usage */ -+ -+#define R_TILEPRO_NUM 130 -+ -+ -+/* TILE-Gx relocations. */ -+#define R_TILEGX_NONE 0 /* No reloc */ -+#define R_TILEGX_64 1 /* Direct 64 bit */ -+#define R_TILEGX_32 2 /* Direct 32 bit */ -+#define R_TILEGX_16 3 /* Direct 16 bit */ -+#define R_TILEGX_8 4 /* Direct 8 bit */ -+#define R_TILEGX_64_PCREL 5 /* PC relative 64 bit */ -+#define R_TILEGX_32_PCREL 6 /* PC relative 32 bit */ -+#define R_TILEGX_16_PCREL 7 /* PC relative 16 bit */ -+#define R_TILEGX_8_PCREL 8 /* PC relative 8 bit */ -+#define R_TILEGX_HW0 9 /* hword 0 16-bit */ -+#define R_TILEGX_HW1 10 /* hword 1 16-bit */ -+#define R_TILEGX_HW2 11 /* hword 2 16-bit */ -+#define R_TILEGX_HW3 12 /* hword 3 16-bit */ -+#define R_TILEGX_HW0_LAST 13 /* last hword 0 16-bit */ -+#define R_TILEGX_HW1_LAST 14 /* last hword 1 16-bit */ -+#define R_TILEGX_HW2_LAST 15 /* last hword 2 16-bit */ -+#define R_TILEGX_COPY 16 /* Copy relocation */ -+#define R_TILEGX_GLOB_DAT 17 /* Create GOT entry */ -+#define R_TILEGX_JMP_SLOT 18 /* Create PLT entry */ -+#define R_TILEGX_RELATIVE 19 /* Adjust by program base */ -+#define R_TILEGX_BROFF_X1 20 /* X1 pipe branch offset */ -+#define R_TILEGX_JUMPOFF_X1 21 /* X1 pipe jump offset */ -+#define R_TILEGX_JUMPOFF_X1_PLT 22 /* X1 pipe jump offset to PLT */ -+#define R_TILEGX_IMM8_X0 23 /* X0 pipe 8-bit */ -+#define R_TILEGX_IMM8_Y0 24 /* Y0 pipe 8-bit */ -+#define R_TILEGX_IMM8_X1 25 /* X1 pipe 8-bit */ -+#define R_TILEGX_IMM8_Y1 26 /* Y1 pipe 8-bit */ -+#define R_TILEGX_DEST_IMM8_X1 27 /* X1 pipe destination 8-bit */ -+#define R_TILEGX_MT_IMM14_X1 28 /* X1 pipe mtspr */ -+#define R_TILEGX_MF_IMM14_X1 29 /* X1 pipe mfspr */ -+#define R_TILEGX_MMSTART_X0 30 /* X0 pipe mm "start" */ -+#define R_TILEGX_MMEND_X0 31 /* X0 pipe mm "end" */ -+#define R_TILEGX_SHAMT_X0 32 /* X0 pipe shift amount */ -+#define R_TILEGX_SHAMT_X1 33 /* X1 pipe shift amount */ -+#define R_TILEGX_SHAMT_Y0 34 /* Y0 pipe shift amount */ -+#define R_TILEGX_SHAMT_Y1 35 /* Y1 pipe shift amount */ -+#define R_TILEGX_IMM16_X0_HW0 36 /* X0 pipe hword 0 */ -+#define R_TILEGX_IMM16_X1_HW0 37 /* X1 pipe hword 0 */ -+#define R_TILEGX_IMM16_X0_HW1 38 /* X0 pipe hword 1 */ -+#define R_TILEGX_IMM16_X1_HW1 39 /* X1 pipe hword 1 */ -+#define R_TILEGX_IMM16_X0_HW2 40 /* X0 pipe hword 2 */ -+#define R_TILEGX_IMM16_X1_HW2 41 /* X1 pipe hword 2 */ -+#define R_TILEGX_IMM16_X0_HW3 42 /* X0 pipe hword 3 */ -+#define R_TILEGX_IMM16_X1_HW3 43 /* X1 pipe hword 3 */ -+#define R_TILEGX_IMM16_X0_HW0_LAST 44 /* X0 pipe last hword 0 */ -+#define R_TILEGX_IMM16_X1_HW0_LAST 45 /* X1 pipe last hword 0 */ -+#define R_TILEGX_IMM16_X0_HW1_LAST 46 /* X0 pipe last hword 1 */ -+#define R_TILEGX_IMM16_X1_HW1_LAST 47 /* X1 pipe last hword 1 */ -+#define R_TILEGX_IMM16_X0_HW2_LAST 48 /* X0 pipe last hword 2 */ -+#define R_TILEGX_IMM16_X1_HW2_LAST 49 /* X1 pipe last hword 2 */ -+#define R_TILEGX_IMM16_X0_HW0_PCREL 50 /* X0 pipe PC relative hword 0 */ -+#define R_TILEGX_IMM16_X1_HW0_PCREL 51 /* X1 pipe PC relative hword 0 */ -+#define R_TILEGX_IMM16_X0_HW1_PCREL 52 /* X0 pipe PC relative hword 1 */ -+#define R_TILEGX_IMM16_X1_HW1_PCREL 53 /* X1 pipe PC relative hword 1 */ -+#define R_TILEGX_IMM16_X0_HW2_PCREL 54 /* X0 pipe PC relative hword 2 */ -+#define 
R_TILEGX_IMM16_X1_HW2_PCREL 55 /* X1 pipe PC relative hword 2 */ -+#define R_TILEGX_IMM16_X0_HW3_PCREL 56 /* X0 pipe PC relative hword 3 */ -+#define R_TILEGX_IMM16_X1_HW3_PCREL 57 /* X1 pipe PC relative hword 3 */ -+#define R_TILEGX_IMM16_X0_HW0_LAST_PCREL 58 /* X0 pipe PC-rel last hword 0 */ -+#define R_TILEGX_IMM16_X1_HW0_LAST_PCREL 59 /* X1 pipe PC-rel last hword 0 */ -+#define R_TILEGX_IMM16_X0_HW1_LAST_PCREL 60 /* X0 pipe PC-rel last hword 1 */ -+#define R_TILEGX_IMM16_X1_HW1_LAST_PCREL 61 /* X1 pipe PC-rel last hword 1 */ -+#define R_TILEGX_IMM16_X0_HW2_LAST_PCREL 62 /* X0 pipe PC-rel last hword 2 */ -+#define R_TILEGX_IMM16_X1_HW2_LAST_PCREL 63 /* X1 pipe PC-rel last hword 2 */ -+#define R_TILEGX_IMM16_X0_HW0_GOT 64 /* X0 pipe hword 0 GOT offset */ -+#define R_TILEGX_IMM16_X1_HW0_GOT 65 /* X1 pipe hword 0 GOT offset */ -+/* Relocs 66-71 are currently not defined. */ -+#define R_TILEGX_IMM16_X0_HW0_LAST_GOT 72 /* X0 pipe last hword 0 GOT offset */ -+#define R_TILEGX_IMM16_X1_HW0_LAST_GOT 73 /* X1 pipe last hword 0 GOT offset */ -+#define R_TILEGX_IMM16_X0_HW1_LAST_GOT 74 /* X0 pipe last hword 1 GOT offset */ -+#define R_TILEGX_IMM16_X1_HW1_LAST_GOT 75 /* X1 pipe last hword 1 GOT offset */ -+/* Relocs 76-77 are currently not defined. */ -+#define R_TILEGX_IMM16_X0_HW0_TLS_GD 78 /* X0 pipe hword 0 TLS GD offset */ -+#define R_TILEGX_IMM16_X1_HW0_TLS_GD 79 /* X1 pipe hword 0 TLS GD offset */ -+#define R_TILEGX_IMM16_X0_HW0_TLS_LE 80 /* X0 pipe hword 0 TLS LE offset */ -+#define R_TILEGX_IMM16_X1_HW0_TLS_LE 81 /* X1 pipe hword 0 TLS LE offset */ -+#define R_TILEGX_IMM16_X0_HW0_LAST_TLS_LE 82 /* X0 pipe last hword 0 LE off */ -+#define R_TILEGX_IMM16_X1_HW0_LAST_TLS_LE 83 /* X1 pipe last hword 0 LE off */ -+#define R_TILEGX_IMM16_X0_HW1_LAST_TLS_LE 84 /* X0 pipe last hword 1 LE off */ -+#define R_TILEGX_IMM16_X1_HW1_LAST_TLS_LE 85 /* X1 pipe last hword 1 LE off */ -+#define R_TILEGX_IMM16_X0_HW0_LAST_TLS_GD 86 /* X0 pipe last hword 0 GD off */ -+#define R_TILEGX_IMM16_X1_HW0_LAST_TLS_GD 87 /* X1 pipe last hword 0 GD off */ -+#define R_TILEGX_IMM16_X0_HW1_LAST_TLS_GD 88 /* X0 pipe last hword 1 GD off */ -+#define R_TILEGX_IMM16_X1_HW1_LAST_TLS_GD 89 /* X1 pipe last hword 1 GD off */ -+/* Relocs 90-91 are currently not defined. */ -+#define R_TILEGX_IMM16_X0_HW0_TLS_IE 92 /* X0 pipe hword 0 TLS IE offset */ -+#define R_TILEGX_IMM16_X1_HW0_TLS_IE 93 /* X1 pipe hword 0 TLS IE offset */ -+/* Relocs 94-99 are currently not defined. */ -+#define R_TILEGX_IMM16_X0_HW0_LAST_TLS_IE 100 /* X0 pipe last hword 0 IE off */ -+#define R_TILEGX_IMM16_X1_HW0_LAST_TLS_IE 101 /* X1 pipe last hword 0 IE off */ -+#define R_TILEGX_IMM16_X0_HW1_LAST_TLS_IE 102 /* X0 pipe last hword 1 IE off */ -+#define R_TILEGX_IMM16_X1_HW1_LAST_TLS_IE 103 /* X1 pipe last hword 1 IE off */ -+/* Relocs 104-105 are currently not defined. 
*/
-+#define R_TILEGX_TLS_DTPMOD64 106 /* 64-bit ID of symbol's module */
-+#define R_TILEGX_TLS_DTPOFF64 107 /* 64-bit offset in TLS block */
-+#define R_TILEGX_TLS_TPOFF64 108 /* 64-bit offset in static TLS block */
-+#define R_TILEGX_TLS_DTPMOD32 109 /* 32-bit ID of symbol's module */
-+#define R_TILEGX_TLS_DTPOFF32 110 /* 32-bit offset in TLS block */
-+#define R_TILEGX_TLS_TPOFF32 111 /* 32-bit offset in static TLS block */
-+#define R_TILEGX_TLS_GD_CALL 112 /* "jal" for TLS GD */
-+#define R_TILEGX_IMM8_X0_TLS_GD_ADD 113 /* X0 pipe "addi" for TLS GD */
-+#define R_TILEGX_IMM8_X1_TLS_GD_ADD 114 /* X1 pipe "addi" for TLS GD */
-+#define R_TILEGX_IMM8_Y0_TLS_GD_ADD 115 /* Y0 pipe "addi" for TLS GD */
-+#define R_TILEGX_IMM8_Y1_TLS_GD_ADD 116 /* Y1 pipe "addi" for TLS GD */
-+#define R_TILEGX_TLS_IE_LOAD 117 /* "ld_tls" for TLS IE */
-+#define R_TILEGX_IMM8_X0_TLS_ADD 118 /* X0 pipe "addi" for TLS GD/IE */
-+#define R_TILEGX_IMM8_X1_TLS_ADD 119 /* X1 pipe "addi" for TLS GD/IE */
-+#define R_TILEGX_IMM8_Y0_TLS_ADD 120 /* Y0 pipe "addi" for TLS GD/IE */
-+#define R_TILEGX_IMM8_Y1_TLS_ADD 121 /* Y1 pipe "addi" for TLS GD/IE */
-+
-+#define R_TILEGX_GNU_VTINHERIT 128 /* GNU C++ vtable hierarchy */
-+#define R_TILEGX_GNU_VTENTRY 129 /* GNU C++ vtable member usage */
-+
-+#define R_TILEGX_NUM 130
-+
-+#endif /* elf.h */
---- a/scripts/mod/mk_elfconfig.c
-+++ b/scripts/mod/mk_elfconfig.c
-@@ -2,7 +2,11 @@
- #include
- #include
- #include
-+#ifndef __APPLE__
- #include
-+#else
-+#include "elf.h"
-+#endif
- 
- int
- main(int argc, char **argv)
---- a/scripts/mod/modpost.h
-+++ b/scripts/mod/modpost.h
-@@ -8,7 +8,11 @@
- #include
- #include
- #include
-+#if !(defined(__APPLE__) || defined(__CYGWIN__))
- #include
-+#else
-+#include "elf.h"
-+#endif
- 
- #include "elfconfig.h"
- 
diff --git a/root/target/linux/generic/hack-5.4/211-darwin-uuid-typedef-clash.patch b/root/target/linux/generic/hack-5.4/211-darwin-uuid-typedef-clash.patch
deleted file mode 100755
index 50a62271..00000000
--- a/root/target/linux/generic/hack-5.4/211-darwin-uuid-typedef-clash.patch
+++ /dev/null
@@ -1,22 +0,0 @@
-From e44fc2af1ddc452b6659d08c16973d65c73b7d0a Mon Sep 17 00:00:00 2001
-From: Kevin Darbyshire-Bryant
-Date: Wed, 5 Feb 2020 18:36:43 +0000
-Subject: [PATCH] file2alias: build on macos
-
-Signed-off-by: Kevin Darbyshire-Bryant
----
- scripts/mod/file2alias.c | 3 +++
- 1 file changed, 3 insertions(+)
-
---- a/scripts/mod/file2alias.c
-+++ b/scripts/mod/file2alias.c
-@@ -38,6 +38,9 @@ typedef struct {
- __u8 b[16];
- } guid_t;
- 
-+#ifdef __APPLE__
-+#define uuid_t compat_uuid_t
-+#endif
- /* backwards compatibility, don't use in new code */
- typedef struct {
- __u8 b[16];
diff --git a/root/target/linux/generic/hack-5.4/212-tools_portability.patch b/root/target/linux/generic/hack-5.4/212-tools_portability.patch
deleted file mode 100755
index 0d8eb6fb..00000000
--- a/root/target/linux/generic/hack-5.4/212-tools_portability.patch
+++ /dev/null
@@ -1,110 +0,0 @@
-From 48232d3d931c95953ce2ddfe7da7bb164aef6a73 Mon Sep 17 00:00:00 2001
-From: Felix Fietkau
-Date: Fri, 7 Jul 2017 17:03:16 +0200
-Subject: fix portability of some includes files in tools/ used on the host
-
-Signed-off-by: Felix Fietkau
----
- tools/include/tools/be_byteshift.h | 4 ++++
- tools/include/tools/le_byteshift.h | 4 ++++
- tools/include/tools/linux_types.h | 22 ++++++++++++++++++++++
- 3 files changed, 30 insertions(+)
- create mode 100644 tools/include/tools/linux_types.h
-
---- a/tools/include/tools/be_byteshift.h
-+++ 
b/tools/include/tools/be_byteshift.h -@@ -2,6 +2,10 @@ - #ifndef _TOOLS_BE_BYTESHIFT_H - #define _TOOLS_BE_BYTESHIFT_H - -+#ifndef __linux__ -+#include "linux_types.h" -+#endif -+ - #include - - static inline uint16_t __get_unaligned_be16(const uint8_t *p) ---- a/tools/include/tools/le_byteshift.h -+++ b/tools/include/tools/le_byteshift.h -@@ -2,6 +2,10 @@ - #ifndef _TOOLS_LE_BYTESHIFT_H - #define _TOOLS_LE_BYTESHIFT_H - -+#ifndef __linux__ -+#include "linux_types.h" -+#endif -+ - #include - - static inline uint16_t __get_unaligned_le16(const uint8_t *p) ---- /dev/null -+++ b/tools/include/tools/linux_types.h -@@ -0,0 +1,26 @@ -+#ifndef __LINUX_TYPES_H -+#define __LINUX_TYPES_H -+ -+#include -+ -+typedef int8_t __s8; -+typedef uint8_t __u8; -+typedef uint8_t __be8; -+typedef uint8_t __le8; -+ -+typedef int16_t __s16; -+typedef uint16_t __u16; -+typedef uint16_t __be16; -+typedef uint16_t __le16; -+ -+typedef int32_t __s32; -+typedef uint32_t __u32; -+typedef uint32_t __be32; -+typedef uint32_t __le32; -+ -+typedef int64_t __s64; -+typedef uint64_t __u64; -+typedef uint64_t __be64; -+typedef uint64_t __le64; -+ -+#endif ---- a/tools/include/linux/types.h -+++ b/tools/include/linux/types.h -@@ -7,8 +7,12 @@ - #include - - #define __SANE_USERSPACE_TYPES__ /* For PPC64, to get LL64 types */ -+#ifndef __linux__ -+#include -+#else - #include - #include -+#endif - - struct page; - struct kmem_cache; ---- a/tools/perf/pmu-events/jevents.c -+++ b/tools/perf/pmu-events/jevents.c -@@ -1,4 +1,6 @@ -+#ifdef __linux__ - #define _XOPEN_SOURCE 500 /* needed for nftw() */ -+#endif - #define _GNU_SOURCE /* needed for asprintf() */ - - /* Parse event JSON files */ -@@ -35,6 +37,7 @@ - #include - #include - #include -+#include - #include - #include - #include ---- a/tools/perf/pmu-events/json.c -+++ b/tools/perf/pmu-events/json.c -@@ -38,7 +38,6 @@ - #include - #include "jsmn.h" - #include "json.h" --#include - - - static char *mapfile(const char *fn, size_t *size) diff --git a/root/target/linux/generic/hack-5.4/214-spidev_h_portability.patch b/root/target/linux/generic/hack-5.4/214-spidev_h_portability.patch deleted file mode 100755 index 415e9a42..00000000 --- a/root/target/linux/generic/hack-5.4/214-spidev_h_portability.patch +++ /dev/null @@ -1,24 +0,0 @@ -From be9be95ff10e16a5b4ad36f903978d0cc5747024 Mon Sep 17 00:00:00 2001 -From: Felix Fietkau -Date: Fri, 7 Jul 2017 17:04:08 +0200 -Subject: kernel: fix linux/spi/spidev.h portability issues with musl - -Felix will try to get this define included into musl - -lede-commit: 795e7cf60de19e7a076a46874fab7bb88b43bbff -Signed-off-by: Felix Fietkau ---- - include/uapi/linux/spi/spidev.h | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - ---- a/include/uapi/linux/spi/spidev.h -+++ b/include/uapi/linux/spi/spidev.h -@@ -117,7 +117,7 @@ struct spi_ioc_transfer { - - /* not all platforms use or _IOC_TYPECHECK() ... */ - #define SPI_MSGSIZE(N) \ -- ((((N)*(sizeof (struct spi_ioc_transfer))) < (1 << _IOC_SIZEBITS)) \ -+ ((((N)*(sizeof (struct spi_ioc_transfer))) < (1 << 13)) \ - ? 
((N)*(sizeof (struct spi_ioc_transfer))) : 0) - #define SPI_IOC_MESSAGE(N) _IOW(SPI_IOC_MAGIC, 0, char[SPI_MSGSIZE(N)]) - diff --git a/root/target/linux/generic/hack-5.4/220-arm-gc_sections.patch b/root/target/linux/generic/hack-5.4/220-arm-gc_sections.patch deleted file mode 100755 index 14e24617..00000000 --- a/root/target/linux/generic/hack-5.4/220-arm-gc_sections.patch +++ /dev/null @@ -1,138 +0,0 @@ -From e3d8676f5722b7622685581e06e8f53e6138e3ab Mon Sep 17 00:00:00 2001 -From: Felix Fietkau -Date: Sat, 15 Jul 2017 23:42:36 +0200 -Subject: use -ffunction-sections, -fdata-sections and --gc-sections - -In combination with kernel symbol export stripping this significantly reduces -the kernel image size. Used on both ARM and MIPS architectures. - -Signed-off-by: Felix Fietkau -Signed-off-by: Jonas Gorski -Signed-off-by: Gabor Juhos ---- - Makefile | 10 +++---- - arch/arm/Kconfig | 1 + - arch/arm/boot/compressed/Makefile | 1 + - arch/arm/kernel/vmlinux.lds.S | 26 ++++++++-------- - arch/mips/Kconfig | 1 + - arch/mips/kernel/vmlinux.lds.S | 4 +-- - include/asm-generic/vmlinux.lds.h | 63 ++++++++++++++++++++------------------- - 7 files changed, 55 insertions(+), 51 deletions(-) - ---- a/arch/arm/Kconfig -+++ b/arch/arm/Kconfig -@@ -112,6 +112,7 @@ config ARM - select HAVE_UID16 - select HAVE_VIRT_CPU_ACCOUNTING_GEN - select IRQ_FORCED_THREADING -+ select HAVE_LD_DEAD_CODE_DATA_ELIMINATION - select MODULES_USE_ELF_REL - select NEED_DMA_MAP_STATE - select OF_EARLY_FLATTREE if OF ---- a/arch/arm/boot/compressed/Makefile -+++ b/arch/arm/boot/compressed/Makefile -@@ -108,6 +108,7 @@ ifeq ($(CONFIG_FUNCTION_TRACER),y) - ORIG_CFLAGS := $(KBUILD_CFLAGS) - KBUILD_CFLAGS = $(subst -pg, , $(ORIG_CFLAGS)) - endif -+KBUILD_CFLAGS_KERNEL := $(patsubst -f%-sections,,$(KBUILD_CFLAGS_KERNEL)) - - # -fstack-protector-strong triggers protection checks in this code, - # but it is being used too early to link to meaningful stack_chk logic. ---- a/arch/arm/kernel/vmlinux.lds.S -+++ b/arch/arm/kernel/vmlinux.lds.S -@@ -73,7 +73,7 @@ SECTIONS - . = ALIGN(4); - __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { - __start___ex_table = .; -- ARM_MMU_KEEP(*(__ex_table)) -+ KEEP(*(__ex_table)) - __stop___ex_table = .; - } - -@@ -100,24 +100,24 @@ SECTIONS - } - .init.arch.info : { - __arch_info_begin = .; -- *(.arch.info.init) -+ KEEP(*(.arch.info.init)) - __arch_info_end = .; - } - .init.tagtable : { - __tagtable_begin = .; -- *(.taglist.init) -+ KEEP(*(.taglist.init)) - __tagtable_end = .; - } - #ifdef CONFIG_SMP_ON_UP - .init.smpalt : { - __smpalt_begin = .; -- *(.alt.smp.init) -+ KEEP(*(.alt.smp.init)) - __smpalt_end = .; - } - #endif - .init.pv_table : { - __pv_table_begin = .; -- *(.pv_table) -+ KEEP(*(.pv_table)) - __pv_table_end = .; - } - ---- a/arch/arm/kernel/vmlinux.lds.h -+++ b/arch/arm/kernel/vmlinux.lds.h -@@ -28,7 +28,7 @@ - #define PROC_INFO \ - . = ALIGN(4); \ - __proc_info_begin = .; \ -- *(.proc.info.init) \ -+ KEEP(*(.proc.info.init)) \ - __proc_info_end = .; - - #define HYPERVISOR_TEXT \ -@@ -39,11 +39,11 @@ - #define IDMAP_TEXT \ - ALIGN_FUNCTION(); \ - __idmap_text_start = .; \ -- *(.idmap.text) \ -+ KEEP(*(.idmap.text)) \ - __idmap_text_end = .; \ - . = ALIGN(PAGE_SIZE); \ - __hyp_idmap_text_start = .; \ -- *(.hyp.idmap.text) \ -+ KEEP(*(.hyp.idmap.text)) \ - __hyp_idmap_text_end = .; - - #define ARM_DISCARD \ -@@ -86,12 +86,12 @@ - . 
= ALIGN(8); \ - .ARM.unwind_idx : { \ - __start_unwind_idx = .; \ -- *(.ARM.exidx*) \ -+ KEEP(*(.ARM.exidx*)) \ - __stop_unwind_idx = .; \ - } \ - .ARM.unwind_tab : { \ - __start_unwind_tab = .; \ -- *(.ARM.extab*) \ -+ KEEP(*(.ARM.extab*)) \ - __stop_unwind_tab = .; \ - } - -@@ -102,14 +102,14 @@ - #define ARM_VECTORS \ - __vectors_start = .; \ - .vectors 0xffff0000 : AT(__vectors_start) { \ -- *(.vectors) \ -+ KEEP(*(.vectors)) \ - } \ - . = __vectors_start + SIZEOF(.vectors); \ - __vectors_end = .; \ - \ - __stubs_start = .; \ - .stubs ADDR(.vectors) + 0x1000 : AT(__stubs_start) { \ -- *(.stubs) \ -+ KEEP(*(.stubs)) \ - } \ - . = __stubs_start + SIZEOF(.stubs); \ - __stubs_end = .; \ diff --git a/root/target/linux/generic/hack-5.4/221-module_exports.patch b/root/target/linux/generic/hack-5.4/221-module_exports.patch deleted file mode 100755 index 47f40ac5..00000000 --- a/root/target/linux/generic/hack-5.4/221-module_exports.patch +++ /dev/null @@ -1,109 +0,0 @@ -From b14784e7883390c20ed3ff904892255404a5914b Mon Sep 17 00:00:00 2001 -From: Felix Fietkau -Date: Fri, 7 Jul 2017 17:05:53 +0200 -Subject: add an optional config option for stripping all unnecessary symbol exports from the kernel image - -lede-commit: bb5a40c64b7c4f4848509fa0a6625055fc9e66cc -Signed-off-by: Felix Fietkau ---- - include/asm-generic/vmlinux.lds.h | 18 +++++++++++++++--- - include/linux/export.h | 9 ++++++++- - scripts/Makefile.build | 2 +- - 3 files changed, 24 insertions(+), 5 deletions(-) - ---- a/include/asm-generic/vmlinux.lds.h -+++ b/include/asm-generic/vmlinux.lds.h -@@ -54,6 +54,16 @@ - #define LOAD_OFFSET 0 - #endif - -+#ifndef SYMTAB_KEEP -+#define SYMTAB_KEEP KEEP(*(SORT(___ksymtab+*))) -+#define SYMTAB_KEEP_GPL KEEP(*(SORT(___ksymtab_gpl+*))) -+#endif -+ -+#ifndef SYMTAB_DISCARD -+#define SYMTAB_DISCARD -+#define SYMTAB_DISCARD_GPL -+#endif -+ - /* Align . to a 8 byte boundary equals to maximum function alignment. */ - #define ALIGN_FUNCTION() . 
= ALIGN(8) - -@@ -407,14 +417,14 @@ - /* Kernel symbol table: Normal symbols */ \ - __ksymtab : AT(ADDR(__ksymtab) - LOAD_OFFSET) { \ - __start___ksymtab = .; \ -- KEEP(*(SORT(___ksymtab+*))) \ -+ SYMTAB_KEEP \ - __stop___ksymtab = .; \ - } \ - \ - /* Kernel symbol table: GPL-only symbols */ \ - __ksymtab_gpl : AT(ADDR(__ksymtab_gpl) - LOAD_OFFSET) { \ - __start___ksymtab_gpl = .; \ -- KEEP(*(SORT(___ksymtab_gpl+*))) \ -+ SYMTAB_KEEP_GPL \ - __stop___ksymtab_gpl = .; \ - } \ - \ -@@ -476,7 +486,7 @@ - \ - /* Kernel symbol table: strings */ \ - __ksymtab_strings : AT(ADDR(__ksymtab_strings) - LOAD_OFFSET) { \ -- *(__ksymtab_strings) \ -+ *(__ksymtab_strings+*) \ - } \ - \ - /* __*init sections */ \ -@@ -905,6 +915,8 @@ - EXIT_TEXT \ - EXIT_DATA \ - EXIT_CALL \ -+ SYMTAB_DISCARD \ -+ SYMTAB_DISCARD_GPL \ - *(.discard) \ - *(.discard.*) \ - *(.modinfo) \ ---- a/include/linux/export.h -+++ b/include/linux/export.h -@@ -98,18 +98,26 @@ struct kernel_symbol { - - #else - -+#ifdef MODULE -+#define __EXPORT_SUFFIX(sym) -+#else -+#define __EXPORT_SUFFIX(sym) "+" #sym -+#endif -+ - #define ___export_symbol_common(sym, sec) \ - extern typeof(sym) sym; \ - __CRC_SYMBOL(sym, sec); \ - static const char __kstrtab_##sym[] \ -- __attribute__((section("__ksymtab_strings"), used, aligned(1))) \ -+ __attribute__((section("__ksymtab_strings" \ -+ __EXPORT_SUFFIX(sym)), used, aligned(1))) \ - = #sym \ - - /* For every exported symbol, place a struct in the __ksymtab section */ - #define ___EXPORT_SYMBOL_NS(sym, sec, ns) \ - ___export_symbol_common(sym, sec); \ - static const char __kstrtabns_##sym[] \ -- __attribute__((section("__ksymtab_strings"), used, aligned(1))) \ -+ __attribute__((section("__ksymtab_strings" \ -+ __EXPORT_SUFFIX(sym)), used, aligned(1))) \ - = #ns; \ - __KSYMTAB_ENTRY_NS(sym, sec) - ---- a/scripts/Makefile.build -+++ b/scripts/Makefile.build -@@ -350,7 +350,7 @@ targets += $(extra-y) $(MAKECMDGOALS) $( - # Linker scripts preprocessor (.lds.S -> .lds) - # --------------------------------------------------------------------------- - quiet_cmd_cpp_lds_S = LDS $@ -- cmd_cpp_lds_S = $(CPP) $(cpp_flags) -P -U$(ARCH) \ -+ cmd_cpp_lds_S = $(CPP) $(EXTRA_LDSFLAGS) $(cpp_flags) -P -U$(ARCH) \ - -D__ASSEMBLY__ -DLINKER_SCRIPT -o $@ $< - - $(obj)/%.lds: $(src)/%.lds.S FORCE diff --git a/root/target/linux/generic/hack-5.4/230-openwrt_lzma_options.patch b/root/target/linux/generic/hack-5.4/230-openwrt_lzma_options.patch deleted file mode 100755 index 809ccbc1..00000000 --- a/root/target/linux/generic/hack-5.4/230-openwrt_lzma_options.patch +++ /dev/null @@ -1,71 +0,0 @@ -From b3d00b452467f621317953d9e4c6f9ae8dcfd271 Mon Sep 17 00:00:00 2001 -From: Imre Kaloz -Date: Fri, 7 Jul 2017 17:06:55 +0200 -Subject: use the openwrt lzma options for now - -lede-commit: 548de949f392049420a6a1feeef118b30ab8ea8c -Signed-off-by: Imre Kaloz ---- - lib/decompress.c | 1 + - scripts/Makefile.lib | 2 +- - usr/gen_initramfs_list.sh | 10 +++++----- - 3 files changed, 7 insertions(+), 6 deletions(-) - ---- a/lib/decompress.c -+++ b/lib/decompress.c -@@ -49,6 +49,7 @@ static const struct compress_format comp - { {0x1f, 0x9e}, "gzip", gunzip }, - { {0x42, 0x5a}, "bzip2", bunzip2 }, - { {0x5d, 0x00}, "lzma", unlzma }, -+ { {0x6d, 0x00}, "lzma-openwrt", unlzma }, - { {0xfd, 0x37}, "xz", unxz }, - { {0x89, 0x4c}, "lzo", unlzo }, - { {0x02, 0x21}, "lz4", unlz4 }, ---- a/scripts/Makefile.lib -+++ b/scripts/Makefile.lib -@@ -328,7 +328,7 @@ quiet_cmd_bzip2 = BZIP2 $@ - # 
--------------------------------------------------------------------------- - - quiet_cmd_lzma = LZMA $@ -- cmd_lzma = { cat $(real-prereqs) | $(LZMA) -9; $(size_append); } > $@ -+ cmd_lzma = { cat $(real-prereqs) | $(LZMA) e -d20 -lc1 -lp2 -pb2 -eos -si -so; $(size_append); } > $@ - - quiet_cmd_lzo = LZO $@ - cmd_lzo = { cat $(real-prereqs) | $(KLZOP) -9; $(size_append); } > $@ ---- a/usr/gen_initramfs_list.sh -+++ b/usr/gen_initramfs_list.sh -@@ -229,7 +229,7 @@ cpio_list= - output="/dev/stdout" - output_file="" - is_cpio_compressed= --compr="gzip -n -9 -f" -+compr="gzip -n -9 -f -" - - arg="$1" - case "$arg" in -@@ -245,13 +245,13 @@ case "$arg" in - output=${cpio_list} - echo "$output_file" | grep -q "\.gz$" \ - && [ -x "`which gzip 2> /dev/null`" ] \ -- && compr="gzip -n -9 -f" -+ && compr="gzip -n -9 -f -" - echo "$output_file" | grep -q "\.bz2$" \ - && [ -x "`which bzip2 2> /dev/null`" ] \ -- && compr="bzip2 -9 -f" -+ && compr="bzip2 -9 -f -" - echo "$output_file" | grep -q "\.lzma$" \ - && [ -x "`which lzma 2> /dev/null`" ] \ -- && compr="lzma -9 -f" -+ && compr="lzma e -d20 -lc1 -lp2 -pb2 -eos -si -so" - echo "$output_file" | grep -q "\.xz$" \ - && [ -x "`which xz 2> /dev/null`" ] \ - && compr="xz --check=crc32 --lzma2=dict=1MiB" -@@ -320,7 +320,7 @@ if [ ! -z ${output_file} ]; then - if [ "${is_cpio_compressed}" = "compressed" ]; then - cat ${cpio_tfile} > ${output_file} - else -- (cat ${cpio_tfile} | ${compr} - > ${output_file}) \ -+ (cat ${cpio_tfile} | ${compr} > ${output_file}) \ - || (rm -f ${output_file} ; false) - fi - [ -z ${cpio_file} ] && rm ${cpio_tfile} diff --git a/root/target/linux/generic/hack-5.4/249-udp-tunnel-selection.patch b/root/target/linux/generic/hack-5.4/249-udp-tunnel-selection.patch deleted file mode 100755 index 2c74298d..00000000 --- a/root/target/linux/generic/hack-5.4/249-udp-tunnel-selection.patch +++ /dev/null @@ -1,11 +0,0 @@ ---- a/net/ipv4/Kconfig -+++ b/net/ipv4/Kconfig -@@ -315,7 +315,7 @@ config NET_IPVTI - on top. - - config NET_UDP_TUNNEL -- tristate -+ tristate "IP: UDP tunneling support" - select NET_IP_TUNNEL - default n - diff --git a/root/target/linux/generic/hack-5.4/250-netfilter_depends.patch b/root/target/linux/generic/hack-5.4/250-netfilter_depends.patch deleted file mode 100755 index d03cb531..00000000 --- a/root/target/linux/generic/hack-5.4/250-netfilter_depends.patch +++ /dev/null @@ -1,27 +0,0 @@ -From: Felix Fietkau -Subject: hack: net: remove bogus netfilter dependencies - -lede-commit: 589d2a377dee27d206fc3725325309cf649e4df6 -Signed-off-by: Felix Fietkau ---- - net/netfilter/Kconfig | 2 -- - 1 file changed, 2 deletions(-) - ---- a/net/netfilter/Kconfig -+++ b/net/netfilter/Kconfig -@@ -228,7 +228,6 @@ config NF_CONNTRACK_FTP - - config NF_CONNTRACK_H323 - tristate "H.323 protocol support" -- depends on IPV6 || IPV6=n - depends on NETFILTER_ADVANCED - help - H.323 is a VoIP signalling protocol from ITU-T. 
As one of the most -@@ -1088,7 +1087,6 @@ config NETFILTER_XT_TARGET_SECMARK - - config NETFILTER_XT_TARGET_TCPMSS - tristate '"TCPMSS" target support' -- depends on IPV6 || IPV6=n - default m if NETFILTER_ADVANCED=n - ---help--- - This option adds a `TCPMSS' target, which allows you to alter the diff --git a/root/target/linux/generic/hack-5.4/251-sound_kconfig.patch b/root/target/linux/generic/hack-5.4/251-sound_kconfig.patch deleted file mode 100755 index f593417c..00000000 --- a/root/target/linux/generic/hack-5.4/251-sound_kconfig.patch +++ /dev/null @@ -1,199 +0,0 @@ -From da3c50704f14132f4adf80d48e9a4cd5d46e54c9 Mon Sep 17 00:00:00 2001 -From: John Crispin -Date: Fri, 7 Jul 2017 17:09:21 +0200 -Subject: kconfig: owrt specifc dependencies - -Signed-off-by: John Crispin ---- - crypto/Kconfig | 10 +++++----- - drivers/bcma/Kconfig | 1 + - drivers/ssb/Kconfig | 3 ++- - lib/Kconfig | 8 ++++---- - net/netfilter/Kconfig | 2 +- - net/wireless/Kconfig | 17 ++++++++++------- - sound/core/Kconfig | 4 ++-- - 7 files changed, 25 insertions(+), 20 deletions(-) - ---- a/crypto/Kconfig -+++ b/crypto/Kconfig -@@ -33,7 +33,7 @@ config CRYPTO_FIPS - this is. - - config CRYPTO_ALGAPI -- tristate -+ tristate "ALGAPI" - select CRYPTO_ALGAPI2 - help - This option provides the API for cryptographic algorithms. -@@ -42,7 +42,7 @@ config CRYPTO_ALGAPI2 - tristate - - config CRYPTO_AEAD -- tristate -+ tristate "AEAD" - select CRYPTO_AEAD2 - select CRYPTO_ALGAPI - -@@ -53,7 +53,7 @@ config CRYPTO_AEAD2 - select CRYPTO_RNG2 - - config CRYPTO_BLKCIPHER -- tristate -+ tristate "BLKCIPHER" - select CRYPTO_BLKCIPHER2 - select CRYPTO_ALGAPI - -@@ -63,7 +63,7 @@ config CRYPTO_BLKCIPHER2 - select CRYPTO_RNG2 - - config CRYPTO_HASH -- tristate -+ tristate "HASH" - select CRYPTO_HASH2 - select CRYPTO_ALGAPI - -@@ -72,7 +72,7 @@ config CRYPTO_HASH2 - select CRYPTO_ALGAPI2 - - config CRYPTO_RNG -- tristate -+ tristate "RNG" - select CRYPTO_RNG2 - select CRYPTO_ALGAPI - ---- a/drivers/bcma/Kconfig -+++ b/drivers/bcma/Kconfig -@@ -16,6 +16,7 @@ if BCMA - # Support for Block-I/O. SELECT this from the driver that needs it. - config BCMA_BLOCKIO - bool -+ default y - - config BCMA_HOST_PCI_POSSIBLE - bool ---- a/drivers/ssb/Kconfig -+++ b/drivers/ssb/Kconfig -@@ -29,6 +29,7 @@ config SSB_SPROM - config SSB_BLOCKIO - bool - depends on SSB -+ default y - - config SSB_PCIHOST_POSSIBLE - bool -@@ -49,7 +50,7 @@ config SSB_PCIHOST - config SSB_B43_PCI_BRIDGE - bool - depends on SSB_PCIHOST -- default n -+ default y - - config SSB_PCMCIAHOST_POSSIBLE - bool ---- a/lib/Kconfig -+++ b/lib/Kconfig -@@ -402,16 +402,16 @@ config BCH_CONST_T - # Textsearch support is select'ed if needed - # - config TEXTSEARCH -- bool -+ bool "Textsearch support" - - config TEXTSEARCH_KMP -- tristate -+ tristate "Textsearch KMP" - - config TEXTSEARCH_BM -- tristate -+ tristate "Textsearch BM" - - config TEXTSEARCH_FSM -- tristate -+ tristate "Textsearch FSM" - - config BTREE - bool ---- a/net/netfilter/Kconfig -+++ b/net/netfilter/Kconfig -@@ -11,7 +11,7 @@ config NETFILTER_INGRESS - infrastructure. 
- - config NETFILTER_NETLINK -- tristate -+ tristate "Netfilter NFNETLINK interface" - - config NETFILTER_FAMILY_BRIDGE - bool ---- a/net/wireless/Kconfig -+++ b/net/wireless/Kconfig -@@ -1,6 +1,6 @@ - # SPDX-License-Identifier: GPL-2.0-only - config WIRELESS_EXT -- bool -+ bool "Wireless extensions" - - config WEXT_CORE - def_bool y -@@ -12,10 +12,10 @@ config WEXT_PROC - depends on WEXT_CORE - - config WEXT_SPY -- bool -+ bool "WEXT_SPY" - - config WEXT_PRIV -- bool -+ bool "WEXT_PRIV" - - config CFG80211 - tristate "cfg80211 - wireless configuration API" -@@ -203,7 +203,7 @@ config CFG80211_WEXT_EXPORT - endif # CFG80211 - - config LIB80211 -- tristate -+ tristate "LIB80211" - default n - help - This options enables a library of common routines used -@@ -212,17 +212,17 @@ config LIB80211 - Drivers should select this themselves if needed. - - config LIB80211_CRYPT_WEP -- tristate -+ tristate "LIB80211_CRYPT_WEP" - select CRYPTO_LIB_ARC4 - - config LIB80211_CRYPT_CCMP -- tristate -+ tristate "LIB80211_CRYPT_CCMP" - select CRYPTO - select CRYPTO_AES - select CRYPTO_CCM - - config LIB80211_CRYPT_TKIP -- tristate -+ tristate "LIB80211_CRYPT_TKIP" - select CRYPTO_LIB_ARC4 - - config LIB80211_DEBUG ---- a/sound/core/Kconfig -+++ b/sound/core/Kconfig -@@ -17,7 +17,7 @@ config SND_DMAENGINE_PCM - tristate - - config SND_HWDEP -- tristate -+ tristate "Sound hardware support" - - config SND_SEQ_DEVICE - tristate -@@ -27,7 +27,7 @@ config SND_RAWMIDI - select SND_SEQ_DEVICE if SND_SEQUENCER != n - - config SND_COMPRESS_OFFLOAD -- tristate -+ tristate "Compression offloading support" - - config SND_JACK - bool diff --git a/root/target/linux/generic/hack-5.4/259-regmap_dynamic.patch b/root/target/linux/generic/hack-5.4/259-regmap_dynamic.patch deleted file mode 100755 index 812e1824..00000000 --- a/root/target/linux/generic/hack-5.4/259-regmap_dynamic.patch +++ /dev/null @@ -1,125 +0,0 @@ -From 811d9e2268a62b830cfe93cd8bc929afcb8b198b Mon Sep 17 00:00:00 2001 -From: Felix Fietkau -Date: Sat, 15 Jul 2017 21:12:38 +0200 -Subject: kernel: move regmap bloat out of the kernel image if it is only being used in modules - -lede-commit: 96f39119815028073583e4fca3a9c5fe9141e998 -Signed-off-by: Felix Fietkau ---- - drivers/base/regmap/Kconfig | 15 ++++++++++----- - drivers/base/regmap/Makefile | 12 ++++++++---- - drivers/base/regmap/regmap.c | 3 +++ - include/linux/regmap.h | 2 +- - 4 files changed, 22 insertions(+), 10 deletions(-) - ---- a/drivers/base/regmap/Kconfig -+++ b/drivers/base/regmap/Kconfig -@@ -4,9 +4,8 @@ - # subsystems should select the appropriate symbols. 
- - config REGMAP -- default y if (REGMAP_I2C || REGMAP_SPI || REGMAP_SPMI || REGMAP_W1 || REGMAP_AC97 || REGMAP_MMIO || REGMAP_IRQ || REGMAP_SCCB || REGMAP_I3C) - select IRQ_DOMAIN if REGMAP_IRQ -- bool -+ tristate - - config REGCACHE_COMPRESSED - select LZO_COMPRESS -@@ -18,38 +17,49 @@ config REGMAP_AC97 - - config REGMAP_I2C - tristate -+ select REGMAP - depends on I2C - - config REGMAP_SLIMBUS - tristate -+ select REGMAP - depends on SLIMBUS - - config REGMAP_SPI - tristate -+ select REGMAP -+ depends on SPI_MASTER - depends on SPI - - config REGMAP_SPMI - tristate -+ select REGMAP - depends on SPMI - - config REGMAP_W1 - tristate -+ select REGMAP - depends on W1 - - config REGMAP_MMIO - tristate -+ select REGMAP - - config REGMAP_IRQ - bool -+ select REGMAP - - config REGMAP_SOUNDWIRE - tristate -+ select REGMAP - depends on SOUNDWIRE - - config REGMAP_SCCB - tristate -+ select REGMAP - depends on I2C - - config REGMAP_I3C - tristate -+ select REGMAP - depends on I3C ---- a/drivers/base/regmap/Makefile -+++ b/drivers/base/regmap/Makefile -@@ -2,10 +2,14 @@ - # For include/trace/define_trace.h to include trace.h - CFLAGS_regmap.o := -I$(src) - --obj-$(CONFIG_REGMAP) += regmap.o regcache.o --obj-$(CONFIG_REGMAP) += regcache-rbtree.o regcache-flat.o --obj-$(CONFIG_REGCACHE_COMPRESSED) += regcache-lzo.o --obj-$(CONFIG_DEBUG_FS) += regmap-debugfs.o -+regmap-core-objs = regmap.o regcache.o regcache-rbtree.o regcache-flat.o -+ifdef CONFIG_DEBUG_FS -+regmap-core-objs += regmap-debugfs.o -+endif -+ifdef CONFIG_REGCACHE_COMPRESSED -+regmap-core-objs += regcache-lzo.o -+endif -+obj-$(CONFIG_REGMAP) += regmap-core.o - obj-$(CONFIG_REGMAP_AC97) += regmap-ac97.o - obj-$(CONFIG_REGMAP_I2C) += regmap-i2c.o - obj-$(CONFIG_REGMAP_SLIMBUS) += regmap-slimbus.o ---- a/drivers/base/regmap/regmap.c -+++ b/drivers/base/regmap/regmap.c -@@ -9,6 +9,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -3118,3 +3119,5 @@ static int __init regmap_initcall(void) - return 0; - } - postcore_initcall(regmap_initcall); -+ -+MODULE_LICENSE("GPL"); ---- a/include/linux/regmap.h -+++ b/include/linux/regmap.h -@@ -185,7 +185,7 @@ struct reg_sequence { - pollret ?: ((cond) ? 0 : -ETIMEDOUT); \ - }) - --#ifdef CONFIG_REGMAP -+#if IS_REACHABLE(CONFIG_REGMAP) - - enum regmap_endian { - /* Unspecified -> 0 -> Backwards compatible default */ diff --git a/root/target/linux/generic/hack-5.4/260-crypto_test_dependencies.patch b/root/target/linux/generic/hack-5.4/260-crypto_test_dependencies.patch deleted file mode 100755 index c1b0b855..00000000 --- a/root/target/linux/generic/hack-5.4/260-crypto_test_dependencies.patch +++ /dev/null @@ -1,52 +0,0 @@ -From fd1799b0bf5efa46dd3e6dfbbf3955564807e508 Mon Sep 17 00:00:00 2001 -From: Felix Fietkau -Date: Fri, 7 Jul 2017 17:12:51 +0200 -Subject: kernel: prevent cryptomgr from pulling in useless extra dependencies for tests that are not run - -Reduces kernel size after LZMA by about 5k on MIPS - -lede-commit: 044c316167e076479a344c59905e5b435b84a77f -Signed-off-by: Felix Fietkau ---- - crypto/Kconfig | 13 ++++++------- - crypto/algboss.c | 4 ++++ - 2 files changed, 10 insertions(+), 7 deletions(-) - ---- a/crypto/Kconfig -+++ b/crypto/Kconfig -@@ -120,13 +120,13 @@ config CRYPTO_MANAGER - cbc(aes). 
- - config CRYPTO_MANAGER2 -- def_tristate CRYPTO_MANAGER || (CRYPTO_MANAGER!=n && CRYPTO_ALGAPI=y) -- select CRYPTO_AEAD2 -- select CRYPTO_HASH2 -- select CRYPTO_BLKCIPHER2 -- select CRYPTO_AKCIPHER2 -- select CRYPTO_KPP2 -- select CRYPTO_ACOMP2 -+ def_tristate CRYPTO_MANAGER || (CRYPTO_MANAGER!=n && CRYPTO_ALGAPI=y && !CRYPTO_MANAGER_DISABLE_TESTS) -+ select CRYPTO_AEAD2 if !CRYPTO_MANAGER_DISABLE_TESTS -+ select CRYPTO_HASH2 if !CRYPTO_MANAGER_DISABLE_TESTS -+ select CRYPTO_BLKCIPHER2 if !CRYPTO_MANAGER_DISABLE_TESTS -+ select CRYPTO_AKCIPHER2 if !CRYPTO_MANAGER_DISABLE_TESTS -+ select CRYPTO_KPP2 if !CRYPTO_MANAGER_DISABLE_TESTS -+ select CRYPTO_ACOMP2 if !CRYPTO_MANAGER_DISABLE_TESTS - - config CRYPTO_USER - tristate "Userspace cryptographic algorithm configuration" ---- a/crypto/algboss.c -+++ b/crypto/algboss.c -@@ -240,8 +240,12 @@ static int cryptomgr_schedule_test(struc - type = alg->cra_flags; - - /* Do not test internal algorithms. */ -+#ifdef CONFIG_CRYPTO_MANAGER_DISABLE_TESTS -+ type |= CRYPTO_ALG_TESTED; -+#else - if (type & CRYPTO_ALG_INTERNAL) - type |= CRYPTO_ALG_TESTED; -+#endif - - param->type = type; - diff --git a/root/target/linux/generic/hack-5.4/260-lib-arc4-unhide.patch b/root/target/linux/generic/hack-5.4/260-lib-arc4-unhide.patch deleted file mode 100755 index a7668acf..00000000 --- a/root/target/linux/generic/hack-5.4/260-lib-arc4-unhide.patch +++ /dev/null @@ -1,15 +0,0 @@ -This makes it possible to select CONFIG_CRYPTO_LIB_ARC4 directly. We -need this to be able to compile this into the kernel and make use of it -from backports. - ---- a/lib/crypto/Kconfig -+++ b/lib/crypto/Kconfig -@@ -6,7 +6,7 @@ config CRYPTO_LIB_AES - tristate - - config CRYPTO_LIB_ARC4 -- tristate -+ tristate "ARC4 cipher library" - - config CRYPTO_ARCH_HAVE_LIB_BLAKE2S - tristate diff --git a/root/target/linux/generic/hack-5.4/280-rfkill-stubs.patch b/root/target/linux/generic/hack-5.4/280-rfkill-stubs.patch deleted file mode 100755 index 2e48aea1..00000000 --- a/root/target/linux/generic/hack-5.4/280-rfkill-stubs.patch +++ /dev/null @@ -1,84 +0,0 @@ -From 236c1acdfef5958010ac9814a9872e0a46fd78ee Mon Sep 17 00:00:00 2001 -From: John Crispin -Date: Fri, 7 Jul 2017 17:13:44 +0200 -Subject: rfkill: add fake rfkill support - -allow building of modules depending on RFKILL even if RFKILL is not enabled. 
- -Signed-off-by: John Crispin ---- - include/linux/rfkill.h | 2 +- - net/Makefile | 2 +- - net/rfkill/Kconfig | 14 +++++++++----- - net/rfkill/Makefile | 2 +- - 4 files changed, 12 insertions(+), 8 deletions(-) - ---- a/include/linux/rfkill.h -+++ b/include/linux/rfkill.h -@@ -64,7 +64,7 @@ struct rfkill_ops { - int (*set_block)(void *data, bool blocked); - }; - --#if defined(CONFIG_RFKILL) || defined(CONFIG_RFKILL_MODULE) -+#if defined(CONFIG_RFKILL_FULL) || defined(CONFIG_RFKILL_FULL_MODULE) - /** - * rfkill_alloc - Allocate rfkill structure - * @name: name of the struct -- the string is not copied internally ---- a/net/Makefile -+++ b/net/Makefile -@@ -53,7 +53,7 @@ obj-$(CONFIG_TIPC) += tipc/ - obj-$(CONFIG_NETLABEL) += netlabel/ - obj-$(CONFIG_IUCV) += iucv/ - obj-$(CONFIG_SMC) += smc/ --obj-$(CONFIG_RFKILL) += rfkill/ -+obj-$(CONFIG_RFKILL_FULL) += rfkill/ - obj-$(CONFIG_NET_9P) += 9p/ - obj-$(CONFIG_CAIF) += caif/ - ifneq ($(CONFIG_DCB),) ---- a/net/rfkill/Kconfig -+++ b/net/rfkill/Kconfig -@@ -2,7 +2,11 @@ - # - # RF switch subsystem configuration - # --menuconfig RFKILL -+config RFKILL -+ bool -+ default y -+ -+menuconfig RFKILL_FULL - tristate "RF switch subsystem support" - help - Say Y here if you want to have control over RF switches -@@ -14,19 +18,19 @@ menuconfig RFKILL - # LED trigger support - config RFKILL_LEDS - bool -- depends on RFKILL -+ depends on RFKILL_FULL - depends on LEDS_TRIGGERS = y || RFKILL = LEDS_TRIGGERS - default y - - config RFKILL_INPUT - bool "RF switch input support" if EXPERT -- depends on RFKILL -+ depends on RFKILL_FULL - depends on INPUT = y || RFKILL = INPUT - default y if !EXPERT - - config RFKILL_GPIO - tristate "GPIO RFKILL driver" -- depends on RFKILL -+ depends on RFKILL_FULL - depends on GPIOLIB || COMPILE_TEST - default n - help ---- a/net/rfkill/Makefile -+++ b/net/rfkill/Makefile -@@ -5,5 +5,5 @@ - - rfkill-y += core.o - rfkill-$(CONFIG_RFKILL_INPUT) += input.o --obj-$(CONFIG_RFKILL) += rfkill.o -+obj-$(CONFIG_RFKILL_FULL) += rfkill.o - obj-$(CONFIG_RFKILL_GPIO) += rfkill-gpio.o diff --git a/root/target/linux/generic/hack-5.4/300-MIPS-r4k_cache-use-more-efficient-cache-blast.patch b/root/target/linux/generic/hack-5.4/300-MIPS-r4k_cache-use-more-efficient-cache-blast.patch deleted file mode 100755 index aed08a5e..00000000 --- a/root/target/linux/generic/hack-5.4/300-MIPS-r4k_cache-use-more-efficient-cache-blast.patch +++ /dev/null @@ -1,64 +0,0 @@ -From: Ben Menchaca -Date: Fri, 7 Jun 2013 18:35:22 -0500 -Subject: MIPS: r4k_cache: use more efficient cache blast - -Optimize the compiler output for larger cache blast cases that are -common for DMA-based networking. 
- -Signed-off-by: Ben Menchaca -Signed-off-by: Felix Fietkau ---- ---- a/arch/mips/include/asm/r4kcache.h -+++ b/arch/mips/include/asm/r4kcache.h -@@ -617,14 +617,46 @@ static inline void prot##extra##blast_## - unsigned long end) \ - { \ - unsigned long lsize = cpu_##desc##_line_size(); \ -+ unsigned long lsize_2 = lsize * 2; \ -+ unsigned long lsize_3 = lsize * 3; \ -+ unsigned long lsize_4 = lsize * 4; \ -+ unsigned long lsize_5 = lsize * 5; \ -+ unsigned long lsize_6 = lsize * 6; \ -+ unsigned long lsize_7 = lsize * 7; \ -+ unsigned long lsize_8 = lsize * 8; \ - unsigned long addr = start & ~(lsize - 1); \ -- unsigned long aend = (end - 1) & ~(lsize - 1); \ -+ unsigned long aend = (end + lsize - 1) & ~(lsize - 1); \ -+ int lines = (aend - addr) / lsize; \ - \ -- while (1) { \ -+ while (lines >= 8) { \ -+ prot##cache_op(hitop, addr); \ -+ prot##cache_op(hitop, addr + lsize); \ -+ prot##cache_op(hitop, addr + lsize_2); \ -+ prot##cache_op(hitop, addr + lsize_3); \ -+ prot##cache_op(hitop, addr + lsize_4); \ -+ prot##cache_op(hitop, addr + lsize_5); \ -+ prot##cache_op(hitop, addr + lsize_6); \ -+ prot##cache_op(hitop, addr + lsize_7); \ -+ addr += lsize_8; \ -+ lines -= 8; \ -+ } \ -+ \ -+ if (lines & 0x4) { \ -+ prot##cache_op(hitop, addr); \ -+ prot##cache_op(hitop, addr + lsize); \ -+ prot##cache_op(hitop, addr + lsize_2); \ -+ prot##cache_op(hitop, addr + lsize_3); \ -+ addr += lsize_4; \ -+ } \ -+ \ -+ if (lines & 0x2) { \ -+ prot##cache_op(hitop, addr); \ -+ prot##cache_op(hitop, addr + lsize); \ -+ addr += lsize_2; \ -+ } \ -+ \ -+ if (lines & 0x1) { \ - prot##cache_op(hitop, addr); \ -- if (addr == aend) \ -- break; \ -- addr += lsize; \ - } \ - } - diff --git a/root/target/linux/generic/hack-5.4/301-mips_image_cmdline_hack.patch b/root/target/linux/generic/hack-5.4/301-mips_image_cmdline_hack.patch deleted file mode 100755 index ddae75f6..00000000 --- a/root/target/linux/generic/hack-5.4/301-mips_image_cmdline_hack.patch +++ /dev/null @@ -1,38 +0,0 @@ -From: John Crispin -Subject: hack: kernel: add generic image_cmdline hack to MIPS targets - -lede-commit: d59f5b3a987a48508257a0ddbaeadc7909f9f976 -Signed-off-by: Gabor Juhos ---- - arch/mips/Kconfig | 4 ++++ - arch/mips/kernel/head.S | 6 ++++++ - 2 files changed, 10 insertions(+) - ---- a/arch/mips/Kconfig -+++ b/arch/mips/Kconfig -@@ -1164,6 +1164,10 @@ config SYNC_R4K - config MIPS_MACHINE - def_bool n - -+config IMAGE_CMDLINE_HACK -+ bool "OpenWrt specific image command line hack" -+ default n -+ - config NO_IOPORT_MAP - def_bool n - ---- a/arch/mips/kernel/head.S -+++ b/arch/mips/kernel/head.S -@@ -79,6 +79,12 @@ FEXPORT(__kernel_entry) - j kernel_entry - #endif /* CONFIG_BOOT_RAW */ - -+#ifdef CONFIG_IMAGE_CMDLINE_HACK -+ .ascii "CMDLINE:" -+EXPORT(__image_cmdline) -+ .fill 0x400 -+#endif /* CONFIG_IMAGE_CMDLINE_HACK */ -+ - __REF - - NESTED(kernel_entry, 16, sp) # kernel entry point diff --git a/root/target/linux/generic/hack-5.4/321-powerpc_crtsavres_prereq.patch b/root/target/linux/generic/hack-5.4/321-powerpc_crtsavres_prereq.patch deleted file mode 100755 index 8591705e..00000000 --- a/root/target/linux/generic/hack-5.4/321-powerpc_crtsavres_prereq.patch +++ /dev/null @@ -1,39 +0,0 @@ -From 107c0964cb8db7ca28ac5199426414fdab3c274d Mon Sep 17 00:00:00 2001 -From: "Alexandros C. Couloumbis" -Date: Fri, 7 Jul 2017 17:14:51 +0200 -Subject: hack: arch: powerpc: drop register save/restore library from modules - -Upstream GCC uses a libgcc function for saving/restoring registers. 
This -makes the code bigger, and upstream kernels need to carry that function -for every single kernel module. Our GCC is patched to avoid those -references, so we can drop the extra bloat for modules. - -lede-commit: e8e1084654f50904e6bf77b70b2de3f137d7b3ec -Signed-off-by: Alexandros C. Couloumbis ---- - arch/powerpc/Makefile | 1 - - 1 file changed, 1 deletion(-) - ---- a/arch/powerpc/Makefile -+++ b/arch/powerpc/Makefile -@@ -61,20 +61,6 @@ machine-$(CONFIG_PPC64) += 64 - machine-$(CONFIG_CPU_LITTLE_ENDIAN) += le - UTS_MACHINE := $(subst $(space),,$(machine-y)) - --# XXX This needs to be before we override LD below --ifdef CONFIG_PPC32 --KBUILD_LDFLAGS_MODULE += arch/powerpc/lib/crtsavres.o --else --KBUILD_LDS_MODULE += $(srctree)/arch/powerpc/kernel/module.lds --ifeq ($(call ld-ifversion, -ge, 225000000, y),y) --# Have the linker provide sfpr if possible. --# There is a corresponding test in arch/powerpc/lib/Makefile --KBUILD_LDFLAGS_MODULE += --save-restore-funcs --else --KBUILD_LDFLAGS_MODULE += arch/powerpc/lib/crtsavres.o --endif --endif -- - ifdef CONFIG_CPU_LITTLE_ENDIAN - KBUILD_CFLAGS += -mlittle-endian - KBUILD_LDFLAGS += -EL diff --git a/root/target/linux/generic/hack-5.4/400-block-fit-partition-parser.patch b/root/target/linux/generic/hack-5.4/400-block-fit-partition-parser.patch deleted file mode 100755 index 6b3267ef..00000000 --- a/root/target/linux/generic/hack-5.4/400-block-fit-partition-parser.patch +++ /dev/null @@ -1,176 +0,0 @@ ---- a/block/partitions/Kconfig -+++ b/block/partitions/Kconfig -@@ -101,6 +101,13 @@ config ATARI_PARTITION - Say Y here if you would like to use hard disks under Linux which - were partitioned under the Atari OS. - -+config FIT_PARTITION -+ bool "Flattened-Image-Tree (FIT) partition support" if PARTITION_ADVANCED -+ default n -+ help -+ Say Y here if your system needs to mount the filesystem part of -+ a Flattened-Image-Tree (FIT) image commonly used with Das U-Boot. 
-+ - config IBM_PARTITION - bool "IBM disk label and partition support" - depends on PARTITION_ADVANCED && S390 ---- a/block/partitions/Makefile -+++ b/block/partitions/Makefile -@@ -9,6 +9,7 @@ obj-$(CONFIG_ACORN_PARTITION) += acorn.o - obj-$(CONFIG_AMIGA_PARTITION) += amiga.o - obj-$(CONFIG_ATARI_PARTITION) += atari.o - obj-$(CONFIG_AIX_PARTITION) += aix.o -+obj-$(CONFIG_FIT_PARTITION) += fit.o - obj-$(CONFIG_CMDLINE_PARTITION) += cmdline.o - obj-$(CONFIG_MAC_PARTITION) += mac.o - obj-$(CONFIG_LDM_PARTITION) += ldm.o ---- a/drivers/mtd/ubi/block.c -+++ b/drivers/mtd/ubi/block.c -@@ -396,7 +396,7 @@ int ubiblock_create(struct ubi_volume_in - dev->leb_size = vi->usable_leb_size; - - /* Initialize the gendisk of this ubiblock device */ -- gd = alloc_disk(1); -+ gd = alloc_disk(0); - if (!gd) { - pr_err("UBI: block: alloc_disk failed\n"); - ret = -ENODEV; -@@ -413,6 +413,7 @@ int ubiblock_create(struct ubi_volume_in - goto out_put_disk; - } - gd->private_data = dev; -+ gd->flags |= GENHD_FL_EXT_DEVT; - sprintf(gd->disk_name, "ubiblock%d_%d", dev->ubi_num, dev->vol_id); - set_capacity(gd, disk_capacity); - dev->gd = gd; ---- a/block/partition-generic.c -+++ b/block/partition-generic.c -@@ -18,6 +18,10 @@ - #include - #include - #include -+#ifdef CONFIG_FIT_PARTITION -+#include -+#endif -+ - - #include "partitions/check.h" - -@@ -180,6 +184,18 @@ ssize_t part_fail_store(struct device *d - } - #endif - -+static ssize_t part_name_show(struct device *dev, -+ struct device_attribute *attr, char *buf) -+{ -+ struct hd_struct *p = dev_to_part(dev); -+ -+ if (p->info && p->info->volname) -+ return sprintf(buf, "%s\n", p->info->volname); -+ -+ buf[0] = '\0'; -+ return 0; -+} -+ - static DEVICE_ATTR(partition, 0444, part_partition_show, NULL); - static DEVICE_ATTR(start, 0444, part_start_show, NULL); - static DEVICE_ATTR(size, 0444, part_size_show, NULL); -@@ -188,6 +204,7 @@ static DEVICE_ATTR(alignment_offset, 044 - static DEVICE_ATTR(discard_alignment, 0444, part_discard_alignment_show, NULL); - static DEVICE_ATTR(stat, 0444, part_stat_show, NULL); - static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL); -+static DEVICE_ATTR(name, 0444, part_name_show, NULL); - #ifdef CONFIG_FAIL_MAKE_REQUEST - static struct device_attribute dev_attr_fail = - __ATTR(make-it-fail, 0644, part_fail_show, part_fail_store); -@@ -202,6 +219,7 @@ static struct attribute *part_attrs[] = - &dev_attr_discard_alignment.attr, - &dev_attr_stat.attr, - &dev_attr_inflight.attr, -+ &dev_attr_name.attr, - #ifdef CONFIG_FAIL_MAKE_REQUEST - &dev_attr_fail.attr, - #endif -@@ -634,6 +652,10 @@ rescan: - if (state->parts[p].flags & ADDPART_FLAG_RAID) - md_autodetect_dev(part_to_dev(part)->devt); - #endif -+#ifdef CONFIG_FIT_PARTITION -+ if ((state->parts[p].flags & ADDPART_FLAG_ROOTDEV) && ROOT_DEV == 0) -+ ROOT_DEV = part_to_dev(part)->devt; -+#endif - } - free_partitions(state); - return 0; ---- a/block/partitions/check.c -+++ b/block/partitions/check.c -@@ -33,6 +33,7 @@ - #include "ibm.h" - #include "ultrix.h" - #include "efi.h" -+#include "fit.h" - #include "karma.h" - #include "sysv68.h" - #include "cmdline.h" -@@ -73,6 +74,9 @@ static int (*check_part[])(struct parsed - #ifdef CONFIG_EFI_PARTITION - efi_partition, /* this must come before msdos */ - #endif -+#ifdef CONFIG_FIT_PARTITION -+ fit_partition, -+#endif - #ifdef CONFIG_SGI_PARTITION - sgi_partition, - #endif ---- a/include/linux/genhd.h -+++ b/include/linux/genhd.h -@@ -614,6 +614,7 @@ struct unixware_disklabel { - #define ADDPART_FLAG_NONE 0 - #define 
ADDPART_FLAG_RAID 1 - #define ADDPART_FLAG_WHOLEDISK 2 -+#define ADDPART_FLAG_ROOTDEV 4 - - extern int blk_alloc_devt(struct hd_struct *part, dev_t *devt); - extern void blk_free_devt(dev_t devt); ---- /dev/null -+++ b/block/partitions/fit.h -@@ -0,0 +1,3 @@ -+/* SPDX-License-Identifier: GPL-2.0-only */ -+int fit_partition(struct parsed_partitions *); -+int parse_fit_partitions(struct parsed_partitions *state, u64 start_sector, u64 nr_sectors, int *slot, int add_remain); ---- a/block/partitions/efi.c -+++ b/block/partitions/efi.c -@@ -681,6 +681,9 @@ int efi_partition(struct parsed_partitio - gpt_entry *ptes = NULL; - u32 i; - unsigned ssz = bdev_logical_block_size(state->bdev) / 512; -+#ifdef CONFIG_FIT_PARTITION -+ u32 extra_slot = 64; -+#endif - - if (!find_valid_gpt(state, &gpt, &ptes) || !gpt || !ptes) { - kfree(gpt); -@@ -722,6 +725,11 @@ int efi_partition(struct parsed_partitio - label_count++; - } - state->parts[i + 1].has_info = true; -+#ifdef CONFIG_FIT_PARTITION -+ /* If this is a U-Boot FIT volume it may have subpartitions */ -+ if (!efi_guidcmp(ptes[i].partition_type_guid, PARTITION_LINUX_FIT_GUID)) -+ (void) parse_fit_partitions(state, start * ssz, size * ssz, &extra_slot, 1); -+#endif - } - kfree(ptes); - kfree(gpt); ---- a/block/partitions/efi.h -+++ b/block/partitions/efi.h -@@ -52,6 +52,9 @@ - #define PARTITION_LINUX_LVM_GUID \ - EFI_GUID( 0xe6d6d379, 0xf507, 0x44c2, \ - 0xa2, 0x3c, 0x23, 0x8f, 0x2a, 0x3d, 0xf9, 0x28) -+#define PARTITION_LINUX_FIT_GUID \ -+ EFI_GUID( 0xcae9be83, 0xb15f, 0x49cc, \ -+ 0x86, 0x3f, 0x08, 0x1b, 0x74, 0x4a, 0x2d, 0x93) - - typedef struct _gpt_header { - __le64 signature; diff --git a/root/target/linux/generic/hack-5.4/400-unlock_mx25l6406e_with_4bit_block_protect.patch b/root/target/linux/generic/hack-5.4/400-unlock_mx25l6406e_with_4bit_block_protect.patch deleted file mode 100755 index 8112fa7e..00000000 --- a/root/target/linux/generic/hack-5.4/400-unlock_mx25l6406e_with_4bit_block_protect.patch +++ /dev/null @@ -1,69 +0,0 @@ ---- a/drivers/mtd/spi-nor/spi-nor.c -+++ b/drivers/mtd/spi-nor/spi-nor.c -@@ -196,7 +196,7 @@ struct flash_info { - u16 page_size; - u16 addr_width; - -- u16 flags; -+ u32 flags; - #define SECT_4K BIT(0) /* SPINOR_OP_BE_4K works uniformly */ - #define SPI_NOR_NO_ERASE BIT(1) /* No erase command needed */ - #define SST_WRITE BIT(2) /* use SST byte programming */ -@@ -233,6 +233,10 @@ struct flash_info { - #define SPI_NOR_SKIP_SFDP BIT(13) /* Skip parsing of SFDP tables */ - #define USE_CLSR BIT(14) /* use CLSR command */ - #define SPI_NOR_OCTAL_READ BIT(15) /* Flash supports Octal Read */ -+#define SPI_NOR_4BIT_BP BIT(17) /* -+ * Flash SR has 4 bit fields (BP0-3) -+ * for block protection. -+ */ - - /* Part specific fixup hooks. 
*/ - const struct spi_nor_fixups *fixups; -@@ -1985,6 +1989,9 @@ static int spi_nor_clear_sr_bp(struct sp - int ret; - u8 mask = SR_BP2 | SR_BP1 | SR_BP0; - -+ if (nor->flags & SNOR_F_HAS_4BIT_BP) -+ mask |= SR_BP3; -+ - ret = read_sr(nor); - if (ret < 0) { - dev_err(nor->dev, "error while reading status register\n"); -@@ -2338,7 +2345,7 @@ static const struct flash_info spi_nor_i - { "mx25l1606e", INFO(0xc22015, 0, 64 * 1024, 32, SECT_4K) }, - { "mx25l3205d", INFO(0xc22016, 0, 64 * 1024, 64, SECT_4K) }, - { "mx25l3255e", INFO(0xc29e16, 0, 64 * 1024, 64, SECT_4K) }, -- { "mx25l6405d", INFO(0xc22017, 0, 64 * 1024, 128, SECT_4K) }, -+ { "mx25l6405d", INFO(0xc22017, 0, 64 * 1024, 128, SECT_4K | SPI_NOR_4BIT_BP) }, - { "mx25u2033e", INFO(0xc22532, 0, 64 * 1024, 4, SECT_4K) }, - { "mx25u3235f", INFO(0xc22536, 0, 64 * 1024, 64, - SECT_4K | SPI_NOR_DUAL_READ | SPI_NOR_QUAD_READ) }, -@@ -5026,6 +5033,9 @@ int spi_nor_scan(struct spi_nor *nor, co - if (info->flags & USE_CLSR) - nor->flags |= SNOR_F_USE_CLSR; - -+ if (info->flags & SPI_NOR_4BIT_BP) -+ nor->flags |= SNOR_F_HAS_4BIT_BP; -+ - if (info->flags & SPI_NOR_NO_ERASE) - mtd->flags |= MTD_NO_ERASE; - ---- a/include/linux/mtd/spi-nor.h -+++ b/include/linux/mtd/spi-nor.h -@@ -127,6 +127,7 @@ - #define SR_BP0 BIT(2) /* Block protect 0 */ - #define SR_BP1 BIT(3) /* Block protect 1 */ - #define SR_BP2 BIT(4) /* Block protect 2 */ -+#define SR_BP3 BIT(5) /* Block protect 3 */ - #define SR_TB BIT(5) /* Top/Bottom protect */ - #define SR_SRWD BIT(7) /* SR write protect */ - /* Spansion/Cypress specific status bits */ -@@ -243,6 +244,7 @@ enum spi_nor_option_flags { - SNOR_F_4B_OPCODES = BIT(6), - SNOR_F_HAS_4BAIT = BIT(7), - SNOR_F_HAS_LOCK = BIT(8), -+ SNOR_F_HAS_4BIT_BP = BIT(12), - }; - - /** diff --git a/root/target/linux/generic/hack-5.4/531-debloat_lzma.patch b/root/target/linux/generic/hack-5.4/531-debloat_lzma.patch deleted file mode 100755 index 2f70eee3..00000000 --- a/root/target/linux/generic/hack-5.4/531-debloat_lzma.patch +++ /dev/null @@ -1,1040 +0,0 @@ -From 3fd297761ac246c54d7723c57fca95c112b99465 Mon Sep 17 00:00:00 2001 -From: Felix Fietkau -Date: Sat, 15 Jul 2017 21:15:44 +0200 -Subject: lzma: de-bloat the lzma library used by jffs2 - -lede-commit: 3fd1dd08fbcbb78b34efefd32c3032e5c99108d6 -Signed-off-by: Felix Fietkau ---- - include/linux/lzma/LzFind.h | 17 --- - include/linux/lzma/LzmaDec.h | 101 --------------- - include/linux/lzma/LzmaEnc.h | 20 --- - lib/lzma/LzFind.c | 287 ++++--------------------------------------- - lib/lzma/LzmaDec.c | 86 +------------ - lib/lzma/LzmaEnc.c | 172 ++------------------------ - 6 files changed, 42 insertions(+), 641 deletions(-) - ---- a/include/linux/lzma/LzFind.h -+++ b/include/linux/lzma/LzFind.h -@@ -55,11 +55,6 @@ typedef struct _CMatchFinder - - #define Inline_MatchFinder_GetNumAvailableBytes(p) ((p)->streamPos - (p)->pos) - --int MatchFinder_NeedMove(CMatchFinder *p); --Byte *MatchFinder_GetPointerToCurrentPos(CMatchFinder *p); --void MatchFinder_MoveBlock(CMatchFinder *p); --void MatchFinder_ReadIfRequired(CMatchFinder *p); -- - void MatchFinder_Construct(CMatchFinder *p); - - /* Conditions: -@@ -70,12 +65,6 @@ int MatchFinder_Create(CMatchFinder *p, - UInt32 keepAddBufferBefore, UInt32 matchMaxLen, UInt32 keepAddBufferAfter, - ISzAlloc *alloc); - void MatchFinder_Free(CMatchFinder *p, ISzAlloc *alloc); --void MatchFinder_Normalize3(UInt32 subValue, CLzRef *items, UInt32 numItems); --void MatchFinder_ReduceOffsets(CMatchFinder *p, UInt32 subValue); -- --UInt32 * GetMatchesSpec1(UInt32 
lenLimit, UInt32 curMatch, UInt32 pos, const Byte *buffer, CLzRef *son, -- UInt32 _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 _cutValue, -- UInt32 *distances, UInt32 maxLen); - - /* - Conditions: -@@ -102,12 +91,6 @@ typedef struct _IMatchFinder - - void MatchFinder_CreateVTable(CMatchFinder *p, IMatchFinder *vTable); - --void MatchFinder_Init(CMatchFinder *p); --UInt32 Bt3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances); --UInt32 Hc3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances); --void Bt3Zip_MatchFinder_Skip(CMatchFinder *p, UInt32 num); --void Hc3Zip_MatchFinder_Skip(CMatchFinder *p, UInt32 num); -- - #ifdef __cplusplus - } - #endif ---- a/include/linux/lzma/LzmaDec.h -+++ b/include/linux/lzma/LzmaDec.h -@@ -31,14 +31,6 @@ typedef struct _CLzmaProps - UInt32 dicSize; - } CLzmaProps; - --/* LzmaProps_Decode - decodes properties --Returns: -- SZ_OK -- SZ_ERROR_UNSUPPORTED - Unsupported properties --*/ -- --SRes LzmaProps_Decode(CLzmaProps *p, const Byte *data, unsigned size); -- - - /* ---------- LZMA Decoder state ---------- */ - -@@ -70,8 +62,6 @@ typedef struct - - #define LzmaDec_Construct(p) { (p)->dic = 0; (p)->probs = 0; } - --void LzmaDec_Init(CLzmaDec *p); -- - /* There are two types of LZMA streams: - 0) Stream with end mark. That end mark adds about 6 bytes to compressed size. - 1) Stream without end mark. You must know exact uncompressed size to decompress such stream. */ -@@ -108,97 +98,6 @@ typedef enum - - /* ELzmaStatus is used only as output value for function call */ - -- --/* ---------- Interfaces ---------- */ -- --/* There are 3 levels of interfaces: -- 1) Dictionary Interface -- 2) Buffer Interface -- 3) One Call Interface -- You can select any of these interfaces, but don't mix functions from different -- groups for same object. */ -- -- --/* There are two variants to allocate state for Dictionary Interface: -- 1) LzmaDec_Allocate / LzmaDec_Free -- 2) LzmaDec_AllocateProbs / LzmaDec_FreeProbs -- You can use variant 2, if you set dictionary buffer manually. -- For Buffer Interface you must always use variant 1. -- --LzmaDec_Allocate* can return: -- SZ_OK -- SZ_ERROR_MEM - Memory allocation error -- SZ_ERROR_UNSUPPORTED - Unsupported properties --*/ -- --SRes LzmaDec_AllocateProbs(CLzmaDec *p, const Byte *props, unsigned propsSize, ISzAlloc *alloc); --void LzmaDec_FreeProbs(CLzmaDec *p, ISzAlloc *alloc); -- --SRes LzmaDec_Allocate(CLzmaDec *state, const Byte *prop, unsigned propsSize, ISzAlloc *alloc); --void LzmaDec_Free(CLzmaDec *state, ISzAlloc *alloc); -- --/* ---------- Dictionary Interface ---------- */ -- --/* You can use it, if you want to eliminate the overhead for data copying from -- dictionary to some other external buffer. -- You must work with CLzmaDec variables directly in this interface. -- -- STEPS: -- LzmaDec_Constr() -- LzmaDec_Allocate() -- for (each new stream) -- { -- LzmaDec_Init() -- while (it needs more decompression) -- { -- LzmaDec_DecodeToDic() -- use data from CLzmaDec::dic and update CLzmaDec::dicPos -- } -- } -- LzmaDec_Free() --*/ -- --/* LzmaDec_DecodeToDic -- -- The decoding to internal dictionary buffer (CLzmaDec::dic). -- You must manually update CLzmaDec::dicPos, if it reaches CLzmaDec::dicBufSize !!! -- --finishMode: -- It has meaning only if the decoding reaches output limit (dicLimit). -- LZMA_FINISH_ANY - Decode just dicLimit bytes. -- LZMA_FINISH_END - Stream must be finished after dicLimit. 
-- --Returns: -- SZ_OK -- status: -- LZMA_STATUS_FINISHED_WITH_MARK -- LZMA_STATUS_NOT_FINISHED -- LZMA_STATUS_NEEDS_MORE_INPUT -- LZMA_STATUS_MAYBE_FINISHED_WITHOUT_MARK -- SZ_ERROR_DATA - Data error --*/ -- --SRes LzmaDec_DecodeToDic(CLzmaDec *p, SizeT dicLimit, -- const Byte *src, SizeT *srcLen, ELzmaFinishMode finishMode, ELzmaStatus *status); -- -- --/* ---------- Buffer Interface ---------- */ -- --/* It's zlib-like interface. -- See LzmaDec_DecodeToDic description for information about STEPS and return results, -- but you must use LzmaDec_DecodeToBuf instead of LzmaDec_DecodeToDic and you don't need -- to work with CLzmaDec variables manually. -- --finishMode: -- It has meaning only if the decoding reaches output limit (*destLen). -- LZMA_FINISH_ANY - Decode just destLen bytes. -- LZMA_FINISH_END - Stream must be finished after (*destLen). --*/ -- --SRes LzmaDec_DecodeToBuf(CLzmaDec *p, Byte *dest, SizeT *destLen, -- const Byte *src, SizeT *srcLen, ELzmaFinishMode finishMode, ELzmaStatus *status); -- -- - /* ---------- One Call Interface ---------- */ - - /* LzmaDecode ---- a/include/linux/lzma/LzmaEnc.h -+++ b/include/linux/lzma/LzmaEnc.h -@@ -31,9 +31,6 @@ typedef struct _CLzmaEncProps - } CLzmaEncProps; - - void LzmaEncProps_Init(CLzmaEncProps *p); --void LzmaEncProps_Normalize(CLzmaEncProps *p); --UInt32 LzmaEncProps_GetDictSize(const CLzmaEncProps *props2); -- - - /* ---------- CLzmaEncHandle Interface ---------- */ - -@@ -53,26 +50,9 @@ CLzmaEncHandle LzmaEnc_Create(ISzAlloc * - void LzmaEnc_Destroy(CLzmaEncHandle p, ISzAlloc *alloc, ISzAlloc *allocBig); - SRes LzmaEnc_SetProps(CLzmaEncHandle p, const CLzmaEncProps *props); - SRes LzmaEnc_WriteProperties(CLzmaEncHandle p, Byte *properties, SizeT *size); --SRes LzmaEnc_Encode(CLzmaEncHandle p, ISeqOutStream *outStream, ISeqInStream *inStream, -- ICompressProgress *progress, ISzAlloc *alloc, ISzAlloc *allocBig); - SRes LzmaEnc_MemEncode(CLzmaEncHandle p, Byte *dest, SizeT *destLen, const Byte *src, SizeT srcLen, - int writeEndMark, ICompressProgress *progress, ISzAlloc *alloc, ISzAlloc *allocBig); - --/* ---------- One Call Interface ---------- */ -- --/* LzmaEncode --Return code: -- SZ_OK - OK -- SZ_ERROR_MEM - Memory allocation error -- SZ_ERROR_PARAM - Incorrect paramater -- SZ_ERROR_OUTPUT_EOF - output buffer overflow -- SZ_ERROR_THREAD - errors in multithreading functions (only for Mt version) --*/ -- --SRes LzmaEncode(Byte *dest, SizeT *destLen, const Byte *src, SizeT srcLen, -- const CLzmaEncProps *props, Byte *propsEncoded, SizeT *propsSize, int writeEndMark, -- ICompressProgress *progress, ISzAlloc *alloc, ISzAlloc *allocBig); -- - #ifdef __cplusplus - } - #endif ---- a/lib/lzma/LzFind.c -+++ b/lib/lzma/LzFind.c -@@ -14,9 +14,15 @@ - - #define kStartMaxLen 3 - -+#if 0 -+#define DIRECT_INPUT p->directInput -+#else -+#define DIRECT_INPUT 1 -+#endif -+ - static void LzInWindow_Free(CMatchFinder *p, ISzAlloc *alloc) - { -- if (!p->directInput) -+ if (!DIRECT_INPUT) - { - alloc->Free(alloc, p->bufferBase); - p->bufferBase = 0; -@@ -28,7 +34,7 @@ static void LzInWindow_Free(CMatchFinder - static int LzInWindow_Create(CMatchFinder *p, UInt32 keepSizeReserv, ISzAlloc *alloc) - { - UInt32 blockSize = p->keepSizeBefore + p->keepSizeAfter + keepSizeReserv; -- if (p->directInput) -+ if (DIRECT_INPUT) - { - p->blockSize = blockSize; - return 1; -@@ -42,12 +48,12 @@ static int LzInWindow_Create(CMatchFinde - return (p->bufferBase != 0); - } - --Byte *MatchFinder_GetPointerToCurrentPos(CMatchFinder *p) { return p->buffer; } --Byte 
MatchFinder_GetIndexByte(CMatchFinder *p, Int32 index) { return p->buffer[index]; } -+static Byte *MatchFinder_GetPointerToCurrentPos(CMatchFinder *p) { return p->buffer; } -+static Byte MatchFinder_GetIndexByte(CMatchFinder *p, Int32 index) { return p->buffer[index]; } - --UInt32 MatchFinder_GetNumAvailableBytes(CMatchFinder *p) { return p->streamPos - p->pos; } -+static UInt32 MatchFinder_GetNumAvailableBytes(CMatchFinder *p) { return p->streamPos - p->pos; } - --void MatchFinder_ReduceOffsets(CMatchFinder *p, UInt32 subValue) -+static void MatchFinder_ReduceOffsets(CMatchFinder *p, UInt32 subValue) - { - p->posLimit -= subValue; - p->pos -= subValue; -@@ -58,7 +64,7 @@ static void MatchFinder_ReadBlock(CMatch - { - if (p->streamEndWasReached || p->result != SZ_OK) - return; -- if (p->directInput) -+ if (DIRECT_INPUT) - { - UInt32 curSize = 0xFFFFFFFF - p->streamPos; - if (curSize > p->directInputRem) -@@ -89,7 +95,7 @@ static void MatchFinder_ReadBlock(CMatch - } - } - --void MatchFinder_MoveBlock(CMatchFinder *p) -+static void MatchFinder_MoveBlock(CMatchFinder *p) - { - memmove(p->bufferBase, - p->buffer - p->keepSizeBefore, -@@ -97,22 +103,14 @@ void MatchFinder_MoveBlock(CMatchFinder - p->buffer = p->bufferBase + p->keepSizeBefore; - } - --int MatchFinder_NeedMove(CMatchFinder *p) -+static int MatchFinder_NeedMove(CMatchFinder *p) - { -- if (p->directInput) -+ if (DIRECT_INPUT) - return 0; - /* if (p->streamEndWasReached) return 0; */ - return ((size_t)(p->bufferBase + p->blockSize - p->buffer) <= p->keepSizeAfter); - } - --void MatchFinder_ReadIfRequired(CMatchFinder *p) --{ -- if (p->streamEndWasReached) -- return; -- if (p->keepSizeAfter >= p->streamPos - p->pos) -- MatchFinder_ReadBlock(p); --} -- - static void MatchFinder_CheckAndMoveAndRead(CMatchFinder *p) - { - if (MatchFinder_NeedMove(p)) -@@ -268,7 +266,7 @@ static void MatchFinder_SetLimits(CMatch - p->posLimit = p->pos + limit; - } - --void MatchFinder_Init(CMatchFinder *p) -+static void MatchFinder_Init(CMatchFinder *p) - { - UInt32 i; - for (i = 0; i < p->hashSizeSum; i++) -@@ -287,7 +285,7 @@ static UInt32 MatchFinder_GetSubValue(CM - return (p->pos - p->historySize - 1) & kNormalizeMask; - } - --void MatchFinder_Normalize3(UInt32 subValue, CLzRef *items, UInt32 numItems) -+static void MatchFinder_Normalize3(UInt32 subValue, CLzRef *items, UInt32 numItems) - { - UInt32 i; - for (i = 0; i < numItems; i++) -@@ -319,38 +317,7 @@ static void MatchFinder_CheckLimits(CMat - MatchFinder_SetLimits(p); - } - --static UInt32 * Hc_GetMatchesSpec(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const Byte *cur, CLzRef *son, -- UInt32 _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 cutValue, -- UInt32 *distances, UInt32 maxLen) --{ -- son[_cyclicBufferPos] = curMatch; -- for (;;) -- { -- UInt32 delta = pos - curMatch; -- if (cutValue-- == 0 || delta >= _cyclicBufferSize) -- return distances; -- { -- const Byte *pb = cur - delta; -- curMatch = son[_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? 
_cyclicBufferSize : 0)]; -- if (pb[maxLen] == cur[maxLen] && *pb == *cur) -- { -- UInt32 len = 0; -- while (++len != lenLimit) -- if (pb[len] != cur[len]) -- break; -- if (maxLen < len) -- { -- *distances++ = maxLen = len; -- *distances++ = delta - 1; -- if (len == lenLimit) -- return distances; -- } -- } -- } -- } --} -- --UInt32 * GetMatchesSpec1(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const Byte *cur, CLzRef *son, -+static UInt32 * GetMatchesSpec1(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const Byte *cur, CLzRef *son, - UInt32 _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 cutValue, - UInt32 *distances, UInt32 maxLen) - { -@@ -460,10 +427,10 @@ static void SkipMatchesSpec(UInt32 lenLi - p->buffer++; \ - if (++p->pos == p->posLimit) MatchFinder_CheckLimits(p); - --#define MOVE_POS_RET MOVE_POS return offset; -- - static void MatchFinder_MovePos(CMatchFinder *p) { MOVE_POS; } - -+#define MOVE_POS_RET MatchFinder_MovePos(p); return offset; -+ - #define GET_MATCHES_HEADER2(minLen, ret_op) \ - UInt32 lenLimit; UInt32 hashValue; const Byte *cur; UInt32 curMatch; \ - lenLimit = p->lenLimit; { if (lenLimit < minLen) { MatchFinder_MovePos(p); ret_op; }} \ -@@ -479,62 +446,7 @@ static void MatchFinder_MovePos(CMatchFi - distances + offset, maxLen) - distances); MOVE_POS_RET; - - #define SKIP_FOOTER \ -- SkipMatchesSpec(lenLimit, curMatch, MF_PARAMS(p)); MOVE_POS; -- --static UInt32 Bt2_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) --{ -- UInt32 offset; -- GET_MATCHES_HEADER(2) -- HASH2_CALC; -- curMatch = p->hash[hashValue]; -- p->hash[hashValue] = p->pos; -- offset = 0; -- GET_MATCHES_FOOTER(offset, 1) --} -- --UInt32 Bt3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) --{ -- UInt32 offset; -- GET_MATCHES_HEADER(3) -- HASH_ZIP_CALC; -- curMatch = p->hash[hashValue]; -- p->hash[hashValue] = p->pos; -- offset = 0; -- GET_MATCHES_FOOTER(offset, 2) --} -- --static UInt32 Bt3_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) --{ -- UInt32 hash2Value, delta2, maxLen, offset; -- GET_MATCHES_HEADER(3) -- -- HASH3_CALC; -- -- delta2 = p->pos - p->hash[hash2Value]; -- curMatch = p->hash[kFix3HashSize + hashValue]; -- -- p->hash[hash2Value] = -- p->hash[kFix3HashSize + hashValue] = p->pos; -- -- -- maxLen = 2; -- offset = 0; -- if (delta2 < p->cyclicBufferSize && *(cur - delta2) == *cur) -- { -- for (; maxLen != lenLimit; maxLen++) -- if (cur[(ptrdiff_t)maxLen - delta2] != cur[maxLen]) -- break; -- distances[0] = maxLen; -- distances[1] = delta2 - 1; -- offset = 2; -- if (maxLen == lenLimit) -- { -- SkipMatchesSpec(lenLimit, curMatch, MF_PARAMS(p)); -- MOVE_POS_RET; -- } -- } -- GET_MATCHES_FOOTER(offset, maxLen) --} -+ SkipMatchesSpec(lenLimit, curMatch, MF_PARAMS(p)); MatchFinder_MovePos(p); - - static UInt32 Bt4_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) - { -@@ -583,108 +495,6 @@ static UInt32 Bt4_MatchFinder_GetMatches - GET_MATCHES_FOOTER(offset, maxLen) - } - --static UInt32 Hc4_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) --{ -- UInt32 hash2Value, hash3Value, delta2, delta3, maxLen, offset; -- GET_MATCHES_HEADER(4) -- -- HASH4_CALC; -- -- delta2 = p->pos - p->hash[ hash2Value]; -- delta3 = p->pos - p->hash[kFix3HashSize + hash3Value]; -- curMatch = p->hash[kFix4HashSize + hashValue]; -- -- p->hash[ hash2Value] = -- p->hash[kFix3HashSize + hash3Value] = -- p->hash[kFix4HashSize + hashValue] = p->pos; -- -- maxLen = 1; -- offset = 0; -- if (delta2 < p->cyclicBufferSize && *(cur - delta2) == *cur) -- { -- distances[0] = 
maxLen = 2; -- distances[1] = delta2 - 1; -- offset = 2; -- } -- if (delta2 != delta3 && delta3 < p->cyclicBufferSize && *(cur - delta3) == *cur) -- { -- maxLen = 3; -- distances[offset + 1] = delta3 - 1; -- offset += 2; -- delta2 = delta3; -- } -- if (offset != 0) -- { -- for (; maxLen != lenLimit; maxLen++) -- if (cur[(ptrdiff_t)maxLen - delta2] != cur[maxLen]) -- break; -- distances[offset - 2] = maxLen; -- if (maxLen == lenLimit) -- { -- p->son[p->cyclicBufferPos] = curMatch; -- MOVE_POS_RET; -- } -- } -- if (maxLen < 3) -- maxLen = 3; -- offset = (UInt32)(Hc_GetMatchesSpec(lenLimit, curMatch, MF_PARAMS(p), -- distances + offset, maxLen) - (distances)); -- MOVE_POS_RET --} -- --UInt32 Hc3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) --{ -- UInt32 offset; -- GET_MATCHES_HEADER(3) -- HASH_ZIP_CALC; -- curMatch = p->hash[hashValue]; -- p->hash[hashValue] = p->pos; -- offset = (UInt32)(Hc_GetMatchesSpec(lenLimit, curMatch, MF_PARAMS(p), -- distances, 2) - (distances)); -- MOVE_POS_RET --} -- --static void Bt2_MatchFinder_Skip(CMatchFinder *p, UInt32 num) --{ -- do -- { -- SKIP_HEADER(2) -- HASH2_CALC; -- curMatch = p->hash[hashValue]; -- p->hash[hashValue] = p->pos; -- SKIP_FOOTER -- } -- while (--num != 0); --} -- --void Bt3Zip_MatchFinder_Skip(CMatchFinder *p, UInt32 num) --{ -- do -- { -- SKIP_HEADER(3) -- HASH_ZIP_CALC; -- curMatch = p->hash[hashValue]; -- p->hash[hashValue] = p->pos; -- SKIP_FOOTER -- } -- while (--num != 0); --} -- --static void Bt3_MatchFinder_Skip(CMatchFinder *p, UInt32 num) --{ -- do -- { -- UInt32 hash2Value; -- SKIP_HEADER(3) -- HASH3_CALC; -- curMatch = p->hash[kFix3HashSize + hashValue]; -- p->hash[hash2Value] = -- p->hash[kFix3HashSize + hashValue] = p->pos; -- SKIP_FOOTER -- } -- while (--num != 0); --} -- - static void Bt4_MatchFinder_Skip(CMatchFinder *p, UInt32 num) - { - do -@@ -701,61 +511,12 @@ static void Bt4_MatchFinder_Skip(CMatchF - while (--num != 0); - } - --static void Hc4_MatchFinder_Skip(CMatchFinder *p, UInt32 num) --{ -- do -- { -- UInt32 hash2Value, hash3Value; -- SKIP_HEADER(4) -- HASH4_CALC; -- curMatch = p->hash[kFix4HashSize + hashValue]; -- p->hash[ hash2Value] = -- p->hash[kFix3HashSize + hash3Value] = -- p->hash[kFix4HashSize + hashValue] = p->pos; -- p->son[p->cyclicBufferPos] = curMatch; -- MOVE_POS -- } -- while (--num != 0); --} -- --void Hc3Zip_MatchFinder_Skip(CMatchFinder *p, UInt32 num) --{ -- do -- { -- SKIP_HEADER(3) -- HASH_ZIP_CALC; -- curMatch = p->hash[hashValue]; -- p->hash[hashValue] = p->pos; -- p->son[p->cyclicBufferPos] = curMatch; -- MOVE_POS -- } -- while (--num != 0); --} -- - void MatchFinder_CreateVTable(CMatchFinder *p, IMatchFinder *vTable) - { - vTable->Init = (Mf_Init_Func)MatchFinder_Init; - vTable->GetIndexByte = (Mf_GetIndexByte_Func)MatchFinder_GetIndexByte; - vTable->GetNumAvailableBytes = (Mf_GetNumAvailableBytes_Func)MatchFinder_GetNumAvailableBytes; - vTable->GetPointerToCurrentPos = (Mf_GetPointerToCurrentPos_Func)MatchFinder_GetPointerToCurrentPos; -- if (!p->btMode) -- { -- vTable->GetMatches = (Mf_GetMatches_Func)Hc4_MatchFinder_GetMatches; -- vTable->Skip = (Mf_Skip_Func)Hc4_MatchFinder_Skip; -- } -- else if (p->numHashBytes == 2) -- { -- vTable->GetMatches = (Mf_GetMatches_Func)Bt2_MatchFinder_GetMatches; -- vTable->Skip = (Mf_Skip_Func)Bt2_MatchFinder_Skip; -- } -- else if (p->numHashBytes == 3) -- { -- vTable->GetMatches = (Mf_GetMatches_Func)Bt3_MatchFinder_GetMatches; -- vTable->Skip = (Mf_Skip_Func)Bt3_MatchFinder_Skip; -- } -- else -- { -- vTable->GetMatches = 
(Mf_GetMatches_Func)Bt4_MatchFinder_GetMatches; -- vTable->Skip = (Mf_Skip_Func)Bt4_MatchFinder_Skip; -- } -+ vTable->GetMatches = (Mf_GetMatches_Func)Bt4_MatchFinder_GetMatches; -+ vTable->Skip = (Mf_Skip_Func)Bt4_MatchFinder_Skip; - } ---- a/lib/lzma/LzmaDec.c -+++ b/lib/lzma/LzmaDec.c -@@ -682,7 +682,7 @@ static void LzmaDec_InitRc(CLzmaDec *p, - p->needFlush = 0; - } - --void LzmaDec_InitDicAndState(CLzmaDec *p, Bool initDic, Bool initState) -+static void LzmaDec_InitDicAndState(CLzmaDec *p, Bool initDic, Bool initState) - { - p->needFlush = 1; - p->remainLen = 0; -@@ -698,7 +698,7 @@ void LzmaDec_InitDicAndState(CLzmaDec *p - p->needInitState = 1; - } - --void LzmaDec_Init(CLzmaDec *p) -+static void LzmaDec_Init(CLzmaDec *p) - { - p->dicPos = 0; - LzmaDec_InitDicAndState(p, True, True); -@@ -716,7 +716,7 @@ static void LzmaDec_InitStateReal(CLzmaD - p->needInitState = 0; - } - --SRes LzmaDec_DecodeToDic(CLzmaDec *p, SizeT dicLimit, const Byte *src, SizeT *srcLen, -+static SRes LzmaDec_DecodeToDic(CLzmaDec *p, SizeT dicLimit, const Byte *src, SizeT *srcLen, - ELzmaFinishMode finishMode, ELzmaStatus *status) - { - SizeT inSize = *srcLen; -@@ -837,65 +837,13 @@ SRes LzmaDec_DecodeToDic(CLzmaDec *p, Si - return (p->code == 0) ? SZ_OK : SZ_ERROR_DATA; - } - --SRes LzmaDec_DecodeToBuf(CLzmaDec *p, Byte *dest, SizeT *destLen, const Byte *src, SizeT *srcLen, ELzmaFinishMode finishMode, ELzmaStatus *status) --{ -- SizeT outSize = *destLen; -- SizeT inSize = *srcLen; -- *srcLen = *destLen = 0; -- for (;;) -- { -- SizeT inSizeCur = inSize, outSizeCur, dicPos; -- ELzmaFinishMode curFinishMode; -- SRes res; -- if (p->dicPos == p->dicBufSize) -- p->dicPos = 0; -- dicPos = p->dicPos; -- if (outSize > p->dicBufSize - dicPos) -- { -- outSizeCur = p->dicBufSize; -- curFinishMode = LZMA_FINISH_ANY; -- } -- else -- { -- outSizeCur = dicPos + outSize; -- curFinishMode = finishMode; -- } -- -- res = LzmaDec_DecodeToDic(p, outSizeCur, src, &inSizeCur, curFinishMode, status); -- src += inSizeCur; -- inSize -= inSizeCur; -- *srcLen += inSizeCur; -- outSizeCur = p->dicPos - dicPos; -- memcpy(dest, p->dic + dicPos, outSizeCur); -- dest += outSizeCur; -- outSize -= outSizeCur; -- *destLen += outSizeCur; -- if (res != 0) -- return res; -- if (outSizeCur == 0 || outSize == 0) -- return SZ_OK; -- } --} -- --void LzmaDec_FreeProbs(CLzmaDec *p, ISzAlloc *alloc) -+static void LzmaDec_FreeProbs(CLzmaDec *p, ISzAlloc *alloc) - { - alloc->Free(alloc, p->probs); - p->probs = 0; - } - --static void LzmaDec_FreeDict(CLzmaDec *p, ISzAlloc *alloc) --{ -- alloc->Free(alloc, p->dic); -- p->dic = 0; --} -- --void LzmaDec_Free(CLzmaDec *p, ISzAlloc *alloc) --{ -- LzmaDec_FreeProbs(p, alloc); -- LzmaDec_FreeDict(p, alloc); --} -- --SRes LzmaProps_Decode(CLzmaProps *p, const Byte *data, unsigned size) -+static SRes LzmaProps_Decode(CLzmaProps *p, const Byte *data, unsigned size) - { - UInt32 dicSize; - Byte d; -@@ -935,7 +883,7 @@ static SRes LzmaDec_AllocateProbs2(CLzma - return SZ_OK; - } - --SRes LzmaDec_AllocateProbs(CLzmaDec *p, const Byte *props, unsigned propsSize, ISzAlloc *alloc) -+static SRes LzmaDec_AllocateProbs(CLzmaDec *p, const Byte *props, unsigned propsSize, ISzAlloc *alloc) - { - CLzmaProps propNew; - RINOK(LzmaProps_Decode(&propNew, props, propsSize)); -@@ -943,28 +891,6 @@ SRes LzmaDec_AllocateProbs(CLzmaDec *p, - p->prop = propNew; - return SZ_OK; - } -- --SRes LzmaDec_Allocate(CLzmaDec *p, const Byte *props, unsigned propsSize, ISzAlloc *alloc) --{ -- CLzmaProps propNew; -- SizeT dicBufSize; -- 
RINOK(LzmaProps_Decode(&propNew, props, propsSize)); -- RINOK(LzmaDec_AllocateProbs2(p, &propNew, alloc)); -- dicBufSize = propNew.dicSize; -- if (p->dic == 0 || dicBufSize != p->dicBufSize) -- { -- LzmaDec_FreeDict(p, alloc); -- p->dic = (Byte *)alloc->Alloc(alloc, dicBufSize); -- if (p->dic == 0) -- { -- LzmaDec_FreeProbs(p, alloc); -- return SZ_ERROR_MEM; -- } -- } -- p->dicBufSize = dicBufSize; -- p->prop = propNew; -- return SZ_OK; --} - - SRes LzmaDecode(Byte *dest, SizeT *destLen, const Byte *src, SizeT *srcLen, - const Byte *propData, unsigned propSize, ELzmaFinishMode finishMode, ---- a/lib/lzma/LzmaEnc.c -+++ b/lib/lzma/LzmaEnc.c -@@ -53,7 +53,7 @@ void LzmaEncProps_Init(CLzmaEncProps *p) - p->writeEndMark = 0; - } - --void LzmaEncProps_Normalize(CLzmaEncProps *p) -+static void LzmaEncProps_Normalize(CLzmaEncProps *p) - { - int level = p->level; - if (level < 0) level = 5; -@@ -76,7 +76,7 @@ void LzmaEncProps_Normalize(CLzmaEncProp - #endif - } - --UInt32 LzmaEncProps_GetDictSize(const CLzmaEncProps *props2) -+static UInt32 __maybe_unused LzmaEncProps_GetDictSize(const CLzmaEncProps *props2) - { - CLzmaEncProps props = *props2; - LzmaEncProps_Normalize(&props); -@@ -93,7 +93,7 @@ UInt32 LzmaEncProps_GetDictSize(const CL - - #define BSR2_RET(pos, res) { unsigned long i; _BitScanReverse(&i, (pos)); res = (i + i) + ((pos >> (i - 1)) & 1); } - --UInt32 GetPosSlot1(UInt32 pos) -+static UInt32 GetPosSlot1(UInt32 pos) - { - UInt32 res; - BSR2_RET(pos, res); -@@ -107,7 +107,7 @@ UInt32 GetPosSlot1(UInt32 pos) - #define kNumLogBits (9 + (int)sizeof(size_t) / 2) - #define kDicLogSizeMaxCompress ((kNumLogBits - 1) * 2 + 7) - --void LzmaEnc_FastPosInit(Byte *g_FastPos) -+static void LzmaEnc_FastPosInit(Byte *g_FastPos) - { - int c = 2, slotFast; - g_FastPos[0] = 0; -@@ -339,58 +339,6 @@ typedef struct - CSaveState saveState; - } CLzmaEnc; - --void LzmaEnc_SaveState(CLzmaEncHandle pp) --{ -- CLzmaEnc *p = (CLzmaEnc *)pp; -- CSaveState *dest = &p->saveState; -- int i; -- dest->lenEnc = p->lenEnc; -- dest->repLenEnc = p->repLenEnc; -- dest->state = p->state; -- -- for (i = 0; i < kNumStates; i++) -- { -- memcpy(dest->isMatch[i], p->isMatch[i], sizeof(p->isMatch[i])); -- memcpy(dest->isRep0Long[i], p->isRep0Long[i], sizeof(p->isRep0Long[i])); -- } -- for (i = 0; i < kNumLenToPosStates; i++) -- memcpy(dest->posSlotEncoder[i], p->posSlotEncoder[i], sizeof(p->posSlotEncoder[i])); -- memcpy(dest->isRep, p->isRep, sizeof(p->isRep)); -- memcpy(dest->isRepG0, p->isRepG0, sizeof(p->isRepG0)); -- memcpy(dest->isRepG1, p->isRepG1, sizeof(p->isRepG1)); -- memcpy(dest->isRepG2, p->isRepG2, sizeof(p->isRepG2)); -- memcpy(dest->posEncoders, p->posEncoders, sizeof(p->posEncoders)); -- memcpy(dest->posAlignEncoder, p->posAlignEncoder, sizeof(p->posAlignEncoder)); -- memcpy(dest->reps, p->reps, sizeof(p->reps)); -- memcpy(dest->litProbs, p->litProbs, (0x300 << p->lclp) * sizeof(CLzmaProb)); --} -- --void LzmaEnc_RestoreState(CLzmaEncHandle pp) --{ -- CLzmaEnc *dest = (CLzmaEnc *)pp; -- const CSaveState *p = &dest->saveState; -- int i; -- dest->lenEnc = p->lenEnc; -- dest->repLenEnc = p->repLenEnc; -- dest->state = p->state; -- -- for (i = 0; i < kNumStates; i++) -- { -- memcpy(dest->isMatch[i], p->isMatch[i], sizeof(p->isMatch[i])); -- memcpy(dest->isRep0Long[i], p->isRep0Long[i], sizeof(p->isRep0Long[i])); -- } -- for (i = 0; i < kNumLenToPosStates; i++) -- memcpy(dest->posSlotEncoder[i], p->posSlotEncoder[i], sizeof(p->posSlotEncoder[i])); -- memcpy(dest->isRep, p->isRep, sizeof(p->isRep)); -- 
memcpy(dest->isRepG0, p->isRepG0, sizeof(p->isRepG0)); -- memcpy(dest->isRepG1, p->isRepG1, sizeof(p->isRepG1)); -- memcpy(dest->isRepG2, p->isRepG2, sizeof(p->isRepG2)); -- memcpy(dest->posEncoders, p->posEncoders, sizeof(p->posEncoders)); -- memcpy(dest->posAlignEncoder, p->posAlignEncoder, sizeof(p->posAlignEncoder)); -- memcpy(dest->reps, p->reps, sizeof(p->reps)); -- memcpy(dest->litProbs, p->litProbs, (0x300 << dest->lclp) * sizeof(CLzmaProb)); --} -- - SRes LzmaEnc_SetProps(CLzmaEncHandle pp, const CLzmaEncProps *props2) - { - CLzmaEnc *p = (CLzmaEnc *)pp; -@@ -600,7 +548,7 @@ static void LitEnc_EncodeMatched(CRangeE - while (symbol < 0x10000); - } - --void LzmaEnc_InitPriceTables(UInt32 *ProbPrices) -+static void LzmaEnc_InitPriceTables(UInt32 *ProbPrices) - { - UInt32 i; - for (i = (1 << kNumMoveReducingBits) / 2; i < kBitModelTotal; i += (1 << kNumMoveReducingBits)) -@@ -1676,7 +1624,7 @@ static void FillDistancesPrices(CLzmaEnc - p->matchPriceCount = 0; - } - --void LzmaEnc_Construct(CLzmaEnc *p) -+static void LzmaEnc_Construct(CLzmaEnc *p) - { - RangeEnc_Construct(&p->rc); - MatchFinder_Construct(&p->matchFinderBase); -@@ -1709,7 +1657,7 @@ CLzmaEncHandle LzmaEnc_Create(ISzAlloc * - return p; - } - --void LzmaEnc_FreeLits(CLzmaEnc *p, ISzAlloc *alloc) -+static void LzmaEnc_FreeLits(CLzmaEnc *p, ISzAlloc *alloc) - { - alloc->Free(alloc, p->litProbs); - alloc->Free(alloc, p->saveState.litProbs); -@@ -1717,7 +1665,7 @@ void LzmaEnc_FreeLits(CLzmaEnc *p, ISzAl - p->saveState.litProbs = 0; - } - --void LzmaEnc_Destruct(CLzmaEnc *p, ISzAlloc *alloc, ISzAlloc *allocBig) -+static void LzmaEnc_Destruct(CLzmaEnc *p, ISzAlloc *alloc, ISzAlloc *allocBig) - { - #ifndef _7ZIP_ST - MatchFinderMt_Destruct(&p->matchFinderMt, allocBig); -@@ -1947,7 +1895,7 @@ static SRes LzmaEnc_Alloc(CLzmaEnc *p, U - return SZ_OK; - } - --void LzmaEnc_Init(CLzmaEnc *p) -+static void LzmaEnc_Init(CLzmaEnc *p) - { - UInt32 i; - p->state = 0; -@@ -2005,7 +1953,7 @@ void LzmaEnc_Init(CLzmaEnc *p) - p->lpMask = (1 << p->lp) - 1; - } - --void LzmaEnc_InitPrices(CLzmaEnc *p) -+static void LzmaEnc_InitPrices(CLzmaEnc *p) - { - if (!p->fastMode) - { -@@ -2037,26 +1985,6 @@ static SRes LzmaEnc_AllocAndInit(CLzmaEn - return SZ_OK; - } - --static SRes LzmaEnc_Prepare(CLzmaEncHandle pp, ISeqOutStream *outStream, ISeqInStream *inStream, -- ISzAlloc *alloc, ISzAlloc *allocBig) --{ -- CLzmaEnc *p = (CLzmaEnc *)pp; -- p->matchFinderBase.stream = inStream; -- p->needInit = 1; -- p->rc.outStream = outStream; -- return LzmaEnc_AllocAndInit(p, 0, alloc, allocBig); --} -- --SRes LzmaEnc_PrepareForLzma2(CLzmaEncHandle pp, -- ISeqInStream *inStream, UInt32 keepWindowSize, -- ISzAlloc *alloc, ISzAlloc *allocBig) --{ -- CLzmaEnc *p = (CLzmaEnc *)pp; -- p->matchFinderBase.stream = inStream; -- p->needInit = 1; -- return LzmaEnc_AllocAndInit(p, keepWindowSize, alloc, allocBig); --} -- - static void LzmaEnc_SetInputBuf(CLzmaEnc *p, const Byte *src, SizeT srcLen) - { - p->matchFinderBase.directInput = 1; -@@ -2064,7 +1992,7 @@ static void LzmaEnc_SetInputBuf(CLzmaEnc - p->matchFinderBase.directInputRem = srcLen; - } - --SRes LzmaEnc_MemPrepare(CLzmaEncHandle pp, const Byte *src, SizeT srcLen, -+static SRes LzmaEnc_MemPrepare(CLzmaEncHandle pp, const Byte *src, SizeT srcLen, - UInt32 keepWindowSize, ISzAlloc *alloc, ISzAlloc *allocBig) - { - CLzmaEnc *p = (CLzmaEnc *)pp; -@@ -2074,7 +2002,7 @@ SRes LzmaEnc_MemPrepare(CLzmaEncHandle p - return LzmaEnc_AllocAndInit(p, keepWindowSize, alloc, allocBig); - } - --void LzmaEnc_Finish(CLzmaEncHandle 
pp) -+static void LzmaEnc_Finish(CLzmaEncHandle pp) - { - #ifndef _7ZIP_ST - CLzmaEnc *p = (CLzmaEnc *)pp; -@@ -2107,53 +2035,6 @@ static size_t MyWrite(void *pp, const vo - return size; - } - -- --UInt32 LzmaEnc_GetNumAvailableBytes(CLzmaEncHandle pp) --{ -- const CLzmaEnc *p = (CLzmaEnc *)pp; -- return p->matchFinder.GetNumAvailableBytes(p->matchFinderObj); --} -- --const Byte *LzmaEnc_GetCurBuf(CLzmaEncHandle pp) --{ -- const CLzmaEnc *p = (CLzmaEnc *)pp; -- return p->matchFinder.GetPointerToCurrentPos(p->matchFinderObj) - p->additionalOffset; --} -- --SRes LzmaEnc_CodeOneMemBlock(CLzmaEncHandle pp, Bool reInit, -- Byte *dest, size_t *destLen, UInt32 desiredPackSize, UInt32 *unpackSize) --{ -- CLzmaEnc *p = (CLzmaEnc *)pp; -- UInt64 nowPos64; -- SRes res; -- CSeqOutStreamBuf outStream; -- -- outStream.funcTable.Write = MyWrite; -- outStream.data = dest; -- outStream.rem = *destLen; -- outStream.overflow = False; -- -- p->writeEndMark = False; -- p->finished = False; -- p->result = SZ_OK; -- -- if (reInit) -- LzmaEnc_Init(p); -- LzmaEnc_InitPrices(p); -- nowPos64 = p->nowPos64; -- RangeEnc_Init(&p->rc); -- p->rc.outStream = &outStream.funcTable; -- -- res = LzmaEnc_CodeOneBlock(p, True, desiredPackSize, *unpackSize); -- -- *unpackSize = (UInt32)(p->nowPos64 - nowPos64); -- *destLen -= outStream.rem; -- if (outStream.overflow) -- return SZ_ERROR_OUTPUT_EOF; -- -- return res; --} -- - static SRes LzmaEnc_Encode2(CLzmaEnc *p, ICompressProgress *progress) - { - SRes res = SZ_OK; -@@ -2184,13 +2065,6 @@ static SRes LzmaEnc_Encode2(CLzmaEnc *p, - return res; - } - --SRes LzmaEnc_Encode(CLzmaEncHandle pp, ISeqOutStream *outStream, ISeqInStream *inStream, ICompressProgress *progress, -- ISzAlloc *alloc, ISzAlloc *allocBig) --{ -- RINOK(LzmaEnc_Prepare(pp, outStream, inStream, alloc, allocBig)); -- return LzmaEnc_Encode2((CLzmaEnc *)pp, progress); --} -- - SRes LzmaEnc_WriteProperties(CLzmaEncHandle pp, Byte *props, SizeT *size) - { - CLzmaEnc *p = (CLzmaEnc *)pp; -@@ -2247,25 +2121,3 @@ SRes LzmaEnc_MemEncode(CLzmaEncHandle pp - return SZ_ERROR_OUTPUT_EOF; - return res; - } -- --SRes LzmaEncode(Byte *dest, SizeT *destLen, const Byte *src, SizeT srcLen, -- const CLzmaEncProps *props, Byte *propsEncoded, SizeT *propsSize, int writeEndMark, -- ICompressProgress *progress, ISzAlloc *alloc, ISzAlloc *allocBig) --{ -- CLzmaEnc *p = (CLzmaEnc *)LzmaEnc_Create(alloc); -- SRes res; -- if (p == 0) -- return SZ_ERROR_MEM; -- -- res = LzmaEnc_SetProps(p, props); -- if (res == SZ_OK) -- { -- res = LzmaEnc_WriteProperties(p, propsEncoded, propsSize); -- if (res == SZ_OK) -- res = LzmaEnc_MemEncode(p, dest, destLen, src, srcLen, -- writeEndMark, progress, alloc, allocBig); -- } -- -- LzmaEnc_Destroy(p, alloc, allocBig); -- return res; --} diff --git a/root/target/linux/generic/hack-5.4/550-loop-Report-EOPNOTSUPP-properly.patch b/root/target/linux/generic/hack-5.4/550-loop-Report-EOPNOTSUPP-properly.patch deleted file mode 100755 index 0e5447d4..00000000 --- a/root/target/linux/generic/hack-5.4/550-loop-Report-EOPNOTSUPP-properly.patch +++ /dev/null @@ -1,41 +0,0 @@ -From 2e864386e62e702a343be2507062ee08d5dfc810 Mon Sep 17 00:00:00 2001 -From: Evan Green -Date: Thu, 14 Nov 2019 15:50:07 -0800 -Subject: loop: Report EOPNOTSUPP properly - -Properly plumb out EOPNOTSUPP from loop driver operations, which may -get returned when for instance a discard operation is attempted but not -supported by the underlying block device. 
Before this change, everything -was reported in the log as an I/O error, which is scary and not -helpful in debugging. - -Signed-off-by: Evan Green -Reviewed-by: Gwendal Grignou -Reviewed-by: Bart Van Assche ---- - drivers/block/loop.c | 7 +++++-- - 1 file changed, 5 insertions(+), 2 deletions(-) - ---- a/drivers/block/loop.c -+++ b/drivers/block/loop.c -@@ -462,7 +462,7 @@ static void lo_complete_rq(struct reques - if (!cmd->use_aio || cmd->ret < 0 || cmd->ret == blk_rq_bytes(rq) || - req_op(rq) != REQ_OP_READ) { - if (cmd->ret < 0) -- ret = BLK_STS_IOERR; -+ ret = errno_to_blk_status(cmd->ret); - goto end_io; - } - -@@ -1973,7 +1973,10 @@ static void loop_handle_cmd(struct loop_ - failed: - /* complete non-aio request */ - if (!cmd->use_aio || ret) { -- cmd->ret = ret ? -EIO : 0; -+ if (ret == -EOPNOTSUPP) -+ cmd->ret = ret; -+ else -+ cmd->ret = ret ? -EIO : 0; - blk_mq_complete_request(rq); - } - } diff --git a/root/target/linux/generic/hack-5.4/640-bridge-only-accept-EAP-locally.patch b/root/target/linux/generic/hack-5.4/640-bridge-only-accept-EAP-locally.patch deleted file mode 100755 index a713aa3c..00000000 --- a/root/target/linux/generic/hack-5.4/640-bridge-only-accept-EAP-locally.patch +++ /dev/null @@ -1,82 +0,0 @@ -From: Felix Fietkau -Date: Fri, 7 Jul 2017 17:18:54 +0200 -Subject: bridge: only accept EAP locally - -When bridging, do not forward EAP frames to other ports, only deliver -them locally, regardless of the state. - -Signed-off-by: Felix Fietkau -[add disable_eap_hack sysfs attribute] -Signed-off-by: Etienne Champetier ---- - ---- a/net/bridge/br_input.c -+++ b/net/bridge/br_input.c -@@ -103,10 +103,14 @@ int br_handle_frame_finish(struct net *n - } - } - -+ BR_INPUT_SKB_CB(skb)->brdev = br->dev; -+ -+ if (skb->protocol == htons(ETH_P_PAE) && !br->disable_eap_hack) -+ return br_pass_frame_up(skb); -+ - if (p->state == BR_STATE_LEARNING) - goto drop; - -- BR_INPUT_SKB_CB(skb)->brdev = br->dev; - BR_INPUT_SKB_CB(skb)->src_port_isolated = !!(p->flags & BR_ISOLATED); - - if (IS_ENABLED(CONFIG_INET) && ---- a/net/bridge/br_private.h -+++ b/net/bridge/br_private.h -@@ -345,6 +345,8 @@ struct net_bridge { - u16 group_fwd_mask; - u16 group_fwd_mask_required; - -+ bool disable_eap_hack; -+ - /* STP */ - bridge_id designated_root; - bridge_id bridge_id; ---- a/net/bridge/br_sysfs_br.c -+++ b/net/bridge/br_sysfs_br.c -@@ -166,6 +166,30 @@ static ssize_t group_fwd_mask_store(stru - } - static DEVICE_ATTR_RW(group_fwd_mask); - -+static ssize_t disable_eap_hack_show(struct device *d, -+ struct device_attribute *attr, -+ char *buf) -+{ -+ struct net_bridge *br = to_bridge(d); -+ return sprintf(buf, "%u\n", br->disable_eap_hack); -+} -+ -+static int set_disable_eap_hack(struct net_bridge *br, unsigned long val) -+{ -+ br->disable_eap_hack = !!val; -+ -+ return 0; -+} -+ -+static ssize_t disable_eap_hack_store(struct device *d, -+ struct device_attribute *attr, -+ const char *buf, -+ size_t len) -+{ -+ return store_bridge_parm(d, buf, len, set_disable_eap_hack); -+} -+static DEVICE_ATTR_RW(disable_eap_hack); -+ - static ssize_t priority_show(struct device *d, struct device_attribute *attr, - char *buf) - { -@@ -851,6 +875,7 @@ static struct attribute *bridge_attrs[] - &dev_attr_ageing_time.attr, - &dev_attr_stp_state.attr, - &dev_attr_group_fwd_mask.attr, -+ &dev_attr_disable_eap_hack.attr, - &dev_attr_priority.attr, - &dev_attr_bridge_id.attr, - &dev_attr_root_id.attr, diff --git a/root/target/linux/generic/hack-5.4/645-netfilter-connmark-introduce-set-dscpmark.patch 
b/root/target/linux/generic/hack-5.4/645-netfilter-connmark-introduce-set-dscpmark.patch deleted file mode 100755 index 2d3fe01a..00000000 --- a/root/target/linux/generic/hack-5.4/645-netfilter-connmark-introduce-set-dscpmark.patch +++ /dev/null @@ -1,212 +0,0 @@ -From eda40b8c8c82e0f2789d6bc8bf63846dce2e8f32 Mon Sep 17 00:00:00 2001 -From: Kevin Darbyshire-Bryant -Date: Sat, 23 Mar 2019 09:29:49 +0000 -Subject: [PATCH] netfilter: connmark: introduce set-dscpmark - -set-dscpmark is a method of storing the DSCP of an ip packet into -conntrack mark. In combination with a suitable tc filter action -(act_ctinfo) DSCP values are able to be stored in the mark on egress and -restored on ingress across links that otherwise alter or bleach DSCP. - -This is useful for qdiscs such as CAKE which are able to shape according -to policies based on DSCP. - -Ingress classification is traditionally a challenging task since -iptables rules haven't yet run and tc filter/eBPF programs are pre-NAT -lookups, hence are unable to see internal IPv4 addresses as used on the -typical home masquerading gateway. - -x_tables CONNMARK set-dscpmark target solves the problem of storing the -DSCP to the conntrack mark in a way suitable for the new act_ctinfo tc -action to restore. - -The set-dscpmark option accepts 2 parameters, a 32bit 'dscpmask' and a -32bit 'statemask'. The dscp mask must be 6 contiguous bits and -represents the area where the DSCP will be stored in the connmark. The -state mask is a minimum 1 bit length mask that must not overlap with the -dscpmask. It represents a flag which is set when the DSCP has been -stored in the conntrack mark. This is useful to implement a 'one shot' -iptables based classification where the 'complicated' iptables rules are -only run once to classify the connection on initial (egress) packet and -subsequent packets are all marked/restored with the same DSCP. A state -mask of zero disables the setting of a status bit/s. - -example syntax with a suitably modified iptables user space application: - -iptables -A QOS_MARK_eth0 -t mangle -j CONNMARK --set-dscpmark 0xfc000000/0x01000000 - -Would store the DSCP in the top 6 bits of the 32bit mark field, and use -the LSB of the top byte as the 'DSCP has been stored' marker. 
- -|----0xFC----conntrack mark----000000---| -| Bits 31-26 | bit 25 | bit24 |~~~ Bit 0| -| DSCP | unused | flag |unused | -|-----------------------0x01---000000---| - ^ ^ - | | - ---| Conditional flag - | set this when dscp -|-ip diffserv-| stored in mark -| 6 bits | -|-------------| - -an identically configured tc action to restore looks like: - -tc filter show dev eth0 ingress -filter parent ffff: protocol all pref 10 u32 chain 0 -filter parent ffff: protocol all pref 10 u32 chain 0 fh 800: ht divisor 1 -filter parent ffff: protocol all pref 10 u32 chain 0 fh 800::800 order 2048 key ht 800 bkt 0 flowid 1: not_in_hw - match 00000000/00000000 at 0 - action order 1: ctinfo zone 0 pipe - index 2 ref 1 bind 1 dscp 0xfc000000/0x1000000 - - action order 2: mirred (Egress Redirect to device ifb4eth0) stolen - index 1 ref 1 bind 1 - -|----0xFC----conntrack mark----000000---| -| Bits 31-26 | bit 25 | bit24 |~~~ Bit 0| -| DSCP | unused | flag |unused | -|-----------------------0x01---000000---| - | | - | | - ---| Conditional flag - v only restore if set -|-ip diffserv-| -| 6 bits | -|-------------| - -Signed-off-by: Kevin Darbyshire-Bryant ---- - include/uapi/linux/netfilter/xt_connmark.h | 10 ++++ - net/netfilter/xt_connmark.c | 55 ++++++++++++++++++---- - 2 files changed, 57 insertions(+), 8 deletions(-) - ---- a/include/uapi/linux/netfilter/xt_connmark.h -+++ b/include/uapi/linux/netfilter/xt_connmark.h -@@ -20,6 +20,11 @@ enum { - }; - - enum { -+ XT_CONNMARK_VALUE = (1 << 0), -+ XT_CONNMARK_DSCP = (1 << 1) -+}; -+ -+enum { - D_SHIFT_LEFT = 0, - D_SHIFT_RIGHT, - }; -@@ -34,6 +39,11 @@ struct xt_connmark_tginfo2 { - __u8 shift_dir, shift_bits, mode; - }; - -+struct xt_connmark_tginfo3 { -+ __u32 ctmark, ctmask, nfmask; -+ __u8 shift_dir, shift_bits, mode, func; -+}; -+ - struct xt_connmark_mtinfo1 { - __u32 mark, mask; - __u8 invert; ---- a/net/netfilter/xt_connmark.c -+++ b/net/netfilter/xt_connmark.c -@@ -24,12 +24,13 @@ MODULE_ALIAS("ipt_connmark"); - MODULE_ALIAS("ip6t_connmark"); - - static unsigned int --connmark_tg_shift(struct sk_buff *skb, const struct xt_connmark_tginfo2 *info) -+connmark_tg_shift(struct sk_buff *skb, const struct xt_connmark_tginfo3 *info) - { - enum ip_conntrack_info ctinfo; - u_int32_t new_targetmark; - struct nf_conn *ct; - u_int32_t newmark; -+ u_int8_t dscp; - - ct = nf_ct_get(skb, &ctinfo); - if (ct == NULL) -@@ -37,12 +38,24 @@ connmark_tg_shift(struct sk_buff *skb, c - - switch (info->mode) { - case XT_CONNMARK_SET: -- newmark = (ct->mark & ~info->ctmask) ^ info->ctmark; -- if (info->shift_dir == D_SHIFT_RIGHT) -- newmark >>= info->shift_bits; -- else -- newmark <<= info->shift_bits; -+ newmark = ct->mark; -+ if (info->func & XT_CONNMARK_VALUE) { -+ newmark = (newmark & ~info->ctmask) ^ info->ctmark; -+ if (info->shift_dir == D_SHIFT_RIGHT) -+ newmark >>= info->shift_bits; -+ else -+ newmark <<= info->shift_bits; -+ } else if (info->func & XT_CONNMARK_DSCP) { -+ if (skb->protocol == htons(ETH_P_IP)) -+ dscp = ipv4_get_dsfield(ip_hdr(skb)) >> 2; -+ else if (skb->protocol == htons(ETH_P_IPV6)) -+ dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> 2; -+ else /* protocol doesn't have diffserv */ -+ break; - -+ newmark = (newmark & ~info->ctmark) | -+ (info->ctmask | (dscp << info->shift_bits)); -+ } - if (ct->mark != newmark) { - ct->mark = newmark; - nf_conntrack_event_cache(IPCT_MARK, ct); -@@ -81,20 +94,36 @@ static unsigned int - connmark_tg(struct sk_buff *skb, const struct xt_action_param *par) - { - const struct xt_connmark_tginfo1 *info = par->targinfo; -- const 
struct xt_connmark_tginfo2 info2 = { -+ const struct xt_connmark_tginfo3 info3 = { - .ctmark = info->ctmark, - .ctmask = info->ctmask, - .nfmask = info->nfmask, - .mode = info->mode, -+ .func = XT_CONNMARK_VALUE - }; - -- return connmark_tg_shift(skb, &info2); -+ return connmark_tg_shift(skb, &info3); - } - - static unsigned int - connmark_tg_v2(struct sk_buff *skb, const struct xt_action_param *par) - { - const struct xt_connmark_tginfo2 *info = par->targinfo; -+ const struct xt_connmark_tginfo3 info3 = { -+ .ctmark = info->ctmark, -+ .ctmask = info->ctmask, -+ .nfmask = info->nfmask, -+ .mode = info->mode, -+ .func = XT_CONNMARK_VALUE -+ }; -+ -+ return connmark_tg_shift(skb, &info3); -+} -+ -+static unsigned int -+connmark_tg_v3(struct sk_buff *skb, const struct xt_action_param *par) -+{ -+ const struct xt_connmark_tginfo3 *info = par->targinfo; - - return connmark_tg_shift(skb, info); - } -@@ -165,6 +194,16 @@ static struct xt_target connmark_tg_reg[ - .targetsize = sizeof(struct xt_connmark_tginfo2), - .destroy = connmark_tg_destroy, - .me = THIS_MODULE, -+ }, -+ { -+ .name = "CONNMARK", -+ .revision = 3, -+ .family = NFPROTO_UNSPEC, -+ .checkentry = connmark_tg_check, -+ .target = connmark_tg_v3, -+ .targetsize = sizeof(struct xt_connmark_tginfo3), -+ .destroy = connmark_tg_destroy, -+ .me = THIS_MODULE, - } - }; - diff --git a/root/target/linux/generic/hack-5.4/647-netfilter-flow-acct.patch b/root/target/linux/generic/hack-5.4/647-netfilter-flow-acct.patch deleted file mode 100755 index f9480d59..00000000 --- a/root/target/linux/generic/hack-5.4/647-netfilter-flow-acct.patch +++ /dev/null @@ -1,70 +0,0 @@ ---- a/include/net/netfilter/nf_flow_table.h -+++ b/include/net/netfilter/nf_flow_table.h -@@ -160,6 +160,8 @@ struct nf_flow_table_hw { - int nf_flow_table_hw_register(const struct nf_flow_table_hw *offload); - void nf_flow_table_hw_unregister(const struct nf_flow_table_hw *offload); - -+void nf_flow_table_acct(struct flow_offload *flow, struct sk_buff *skb, int dir); -+ - extern struct work_struct nf_flow_offload_hw_work; - - #define MODULE_ALIAS_NF_FLOWTABLE(family) \ ---- a/net/netfilter/nf_flow_table_core.c -+++ b/net/netfilter/nf_flow_table_core.c -@@ -13,6 +13,7 @@ - #include - #include - #include -+#include - - struct flow_offload_entry { - struct flow_offload flow; -@@ -164,6 +165,22 @@ void flow_offload_free(struct flow_offlo - } - EXPORT_SYMBOL_GPL(flow_offload_free); - -+void nf_flow_table_acct(struct flow_offload *flow, struct sk_buff *skb, int dir) -+{ -+ struct flow_offload_entry *entry; -+ struct nf_conn_acct *acct; -+ -+ entry = container_of(flow, struct flow_offload_entry, flow); -+ acct = nf_conn_acct_find(entry->ct); -+ if (acct) { -+ struct nf_conn_counter *counter = acct->counter; -+ -+ atomic64_inc(&counter[dir].packets); -+ atomic64_add(skb->len, &counter[dir].bytes); -+ } -+} -+EXPORT_SYMBOL_GPL(nf_flow_table_acct); -+ - static u32 flow_offload_hash(const void *data, u32 len, u32 seed) - { - const struct flow_offload_tuple *tuple = data; ---- a/net/netfilter/nf_flow_table_ip.c -+++ b/net/netfilter/nf_flow_table_ip.c -@@ -12,6 +12,7 @@ - #include - #include - #include -+ - /* For layer 4 checksum field offset. 
*/ - #include - #include -@@ -296,6 +297,7 @@ nf_flow_offload_ip_hook(void *priv, stru - skb->dev = outdev; - nexthop = rt_nexthop(rt, flow->tuplehash[!dir].tuple.src_v4.s_addr); - skb_dst_set_noref(skb, &rt->dst); -+ nf_flow_table_acct(flow, skb, dir); - neigh_xmit(NEIGH_ARP_TABLE, outdev, &nexthop, skb); - - return NF_STOLEN; -@@ -526,6 +528,7 @@ nf_flow_offload_ipv6_hook(void *priv, st - skb->dev = outdev; - nexthop = rt6_nexthop(rt, &flow->tuplehash[!dir].tuple.src_v6); - skb_dst_set_noref(skb, &rt->dst); -+ nf_flow_table_acct(flow, skb, dir); - neigh_xmit(NEIGH_ND_TABLE, outdev, nexthop, skb); - - return NF_STOLEN; diff --git a/root/target/linux/generic/hack-5.4/650-netfilter-add-xt_OFFLOAD-target.patch b/root/target/linux/generic/hack-5.4/650-netfilter-add-xt_OFFLOAD-target.patch deleted file mode 100755 index d584cb5c..00000000 --- a/root/target/linux/generic/hack-5.4/650-netfilter-add-xt_OFFLOAD-target.patch +++ /dev/null @@ -1,589 +0,0 @@ -From: Felix Fietkau -Date: Tue, 20 Feb 2018 15:56:02 +0100 -Subject: [PATCH] netfilter: add xt_OFFLOAD target - -Signed-off-by: Felix Fietkau ---- - create mode 100644 net/netfilter/xt_OFFLOAD.c - ---- a/net/ipv4/netfilter/Kconfig -+++ b/net/ipv4/netfilter/Kconfig -@@ -56,8 +56,6 @@ config NF_TABLES_ARP - help - This option enables the ARP support for nf_tables. - --endif # NF_TABLES -- - config NF_FLOW_TABLE_IPV4 - tristate "Netfilter flow table IPv4 module" - depends on NF_FLOW_TABLE -@@ -66,6 +64,8 @@ config NF_FLOW_TABLE_IPV4 - - To compile it as a module, choose M here. - -+endif # NF_TABLES -+ - config NF_DUP_IPV4 - tristate "Netfilter IPv4 packet duplication to alternate destination" - depends on !NF_CONNTRACK || NF_CONNTRACK ---- a/net/ipv6/netfilter/Kconfig -+++ b/net/ipv6/netfilter/Kconfig -@@ -45,7 +45,6 @@ config NFT_FIB_IPV6 - multicast or blackhole. - - endif # NF_TABLES_IPV6 --endif # NF_TABLES - - config NF_FLOW_TABLE_IPV6 - tristate "Netfilter flow table IPv6 module" -@@ -55,6 +54,8 @@ config NF_FLOW_TABLE_IPV6 - - To compile it as a module, choose M here. - -+endif # NF_TABLES -+ - config NF_DUP_IPV6 - tristate "Netfilter IPv6 packet duplication to alternate destination" - depends on !NF_CONNTRACK || NF_CONNTRACK ---- a/net/netfilter/Kconfig -+++ b/net/netfilter/Kconfig -@@ -690,8 +690,6 @@ config NFT_FIB_NETDEV - - endif # NF_TABLES_NETDEV - --endif # NF_TABLES -- - config NF_FLOW_TABLE_INET - tristate "Netfilter flow table mixed IPv4/IPv6 module" - depends on NF_FLOW_TABLE -@@ -700,11 +698,12 @@ config NF_FLOW_TABLE_INET - - To compile it as a module, choose M here. - -+endif # NF_TABLES -+ - config NF_FLOW_TABLE - tristate "Netfilter flow table module" - depends on NETFILTER_INGRESS - depends on NF_CONNTRACK -- depends on NF_TABLES - help - This option adds the flow table core infrastructure. 
- -@@ -993,6 +992,15 @@ config NETFILTER_XT_TARGET_NOTRACK - depends on NETFILTER_ADVANCED - select NETFILTER_XT_TARGET_CT - -+config NETFILTER_XT_TARGET_FLOWOFFLOAD -+ tristate '"FLOWOFFLOAD" target support' -+ depends on NF_FLOW_TABLE -+ depends on NETFILTER_INGRESS -+ help -+ This option adds a `FLOWOFFLOAD' target, which uses the nf_flow_offload -+ module to speed up processing of packets by bypassing the usual -+ netfilter chains -+ - config NETFILTER_XT_TARGET_RATEEST - tristate '"RATEEST" target support' - depends on NETFILTER_ADVANCED ---- a/net/netfilter/Makefile -+++ b/net/netfilter/Makefile -@@ -141,6 +141,7 @@ obj-$(CONFIG_NETFILTER_XT_TARGET_CLASSIF - obj-$(CONFIG_NETFILTER_XT_TARGET_CONNSECMARK) += xt_CONNSECMARK.o - obj-$(CONFIG_NETFILTER_XT_TARGET_CT) += xt_CT.o - obj-$(CONFIG_NETFILTER_XT_TARGET_DSCP) += xt_DSCP.o -+obj-$(CONFIG_NETFILTER_XT_TARGET_FLOWOFFLOAD) += xt_FLOWOFFLOAD.o - obj-$(CONFIG_NETFILTER_XT_TARGET_HL) += xt_HL.o - obj-$(CONFIG_NETFILTER_XT_TARGET_HMARK) += xt_HMARK.o - obj-$(CONFIG_NETFILTER_XT_TARGET_LED) += xt_LED.o ---- /dev/null -+++ b/net/netfilter/xt_FLOWOFFLOAD.c -@@ -0,0 +1,427 @@ -+/* -+ * Copyright (C) 2018 Felix Fietkau -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License version 2 as -+ * published by the Free Software Foundation. -+ */ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+static struct nf_flowtable nf_flowtable; -+static HLIST_HEAD(hooks); -+static DEFINE_SPINLOCK(hooks_lock); -+static struct delayed_work hook_work; -+ -+struct xt_flowoffload_hook { -+ struct hlist_node list; -+ struct nf_hook_ops ops; -+ struct net *net; -+ bool registered; -+ bool used; -+}; -+ -+static unsigned int -+xt_flowoffload_net_hook(void *priv, struct sk_buff *skb, -+ const struct nf_hook_state *state) -+{ -+ switch (skb->protocol) { -+ case htons(ETH_P_IP): -+ return nf_flow_offload_ip_hook(priv, skb, state); -+ case htons(ETH_P_IPV6): -+ return nf_flow_offload_ipv6_hook(priv, skb, state); -+ } -+ -+ return NF_ACCEPT; -+} -+ -+int nf_flow_table_iterate(struct nf_flowtable *flow_table, -+ void (*iter)(struct flow_offload *flow, void *data), -+ void *data); -+ -+static int -+xt_flowoffload_create_hook(struct net_device *dev) -+{ -+ struct xt_flowoffload_hook *hook; -+ struct nf_hook_ops *ops; -+ -+ hook = kzalloc(sizeof(*hook), GFP_ATOMIC); -+ if (!hook) -+ return -ENOMEM; -+ -+ ops = &hook->ops; -+ ops->pf = NFPROTO_NETDEV; -+ ops->hooknum = NF_NETDEV_INGRESS; -+ ops->priority = 10; -+ ops->priv = &nf_flowtable; -+ ops->hook = xt_flowoffload_net_hook; -+ ops->dev = dev; -+ -+ hlist_add_head(&hook->list, &hooks); -+ mod_delayed_work(system_power_efficient_wq, &hook_work, 0); -+ -+ return 0; -+} -+ -+static struct xt_flowoffload_hook * -+flow_offload_lookup_hook(struct net_device *dev) -+{ -+ struct xt_flowoffload_hook *hook; -+ -+ hlist_for_each_entry(hook, &hooks, list) { -+ if (hook->ops.dev == dev) -+ return hook; -+ } -+ -+ return NULL; -+} -+ -+static void -+xt_flowoffload_check_device(struct net_device *dev) -+{ -+ struct xt_flowoffload_hook *hook; -+ -+ spin_lock_bh(&hooks_lock); -+ hook = flow_offload_lookup_hook(dev); -+ if (hook) -+ hook->used = true; -+ else -+ xt_flowoffload_create_hook(dev); -+ spin_unlock_bh(&hooks_lock); -+} -+ -+static void -+xt_flowoffload_register_hooks(void) -+{ -+ struct xt_flowoffload_hook *hook; -+ -+restart: -+ hlist_for_each_entry(hook, &hooks, list) { -+ if (hook->registered) 
-+ continue; -+ -+ hook->registered = true; -+ hook->net = dev_net(hook->ops.dev); -+ spin_unlock_bh(&hooks_lock); -+ nf_register_net_hook(hook->net, &hook->ops); -+ spin_lock_bh(&hooks_lock); -+ goto restart; -+ } -+ -+} -+ -+static void -+xt_flowoffload_cleanup_hooks(void) -+{ -+ struct xt_flowoffload_hook *hook; -+ -+restart: -+ hlist_for_each_entry(hook, &hooks, list) { -+ if (hook->used || !hook->registered) -+ continue; -+ -+ hlist_del(&hook->list); -+ spin_unlock_bh(&hooks_lock); -+ nf_unregister_net_hook(hook->net, &hook->ops); -+ kfree(hook); -+ spin_lock_bh(&hooks_lock); -+ goto restart; -+ } -+ -+} -+ -+static void -+xt_flowoffload_check_hook(struct flow_offload *flow, void *data) -+{ -+ struct flow_offload_tuple *tuple = &flow->tuplehash[0].tuple; -+ struct xt_flowoffload_hook *hook; -+ bool *found = data; -+ struct rtable *rt = (struct rtable *)tuple->dst_cache; -+ -+ spin_lock_bh(&hooks_lock); -+ hlist_for_each_entry(hook, &hooks, list) { -+ if (hook->ops.dev->ifindex != tuple->iifidx && -+ hook->ops.dev->ifindex != rt->dst.dev->ifindex) -+ continue; -+ -+ hook->used = true; -+ *found = true; -+ } -+ spin_unlock_bh(&hooks_lock); -+} -+ -+static void -+xt_flowoffload_hook_work(struct work_struct *work) -+{ -+ struct xt_flowoffload_hook *hook; -+ bool found = false; -+ int err; -+ -+ spin_lock_bh(&hooks_lock); -+ xt_flowoffload_register_hooks(); -+ hlist_for_each_entry(hook, &hooks, list) -+ hook->used = false; -+ spin_unlock_bh(&hooks_lock); -+ -+ err = nf_flow_table_iterate(&nf_flowtable, xt_flowoffload_check_hook, -+ &found); -+ if (err && err != -EAGAIN) -+ goto out; -+ -+ spin_lock_bh(&hooks_lock); -+ xt_flowoffload_cleanup_hooks(); -+ spin_unlock_bh(&hooks_lock); -+ -+out: -+ if (found) -+ queue_delayed_work(system_power_efficient_wq, &hook_work, HZ); -+} -+ -+static bool -+xt_flowoffload_skip(struct sk_buff *skb, int family) -+{ -+ if (skb_sec_path(skb)) -+ return true; -+ -+ if (family == NFPROTO_IPV4) { -+ const struct ip_options *opt = &(IPCB(skb)->opt); -+ -+ if (unlikely(opt->optlen)) -+ return true; -+ } -+ -+ return false; -+} -+ -+static struct dst_entry * -+xt_flowoffload_dst(const struct nf_conn *ct, enum ip_conntrack_dir dir, -+ const struct xt_action_param *par, int ifindex) -+{ -+ struct dst_entry *dst = NULL; -+ struct flowi fl; -+ -+ memset(&fl, 0, sizeof(fl)); -+ switch (xt_family(par)) { -+ case NFPROTO_IPV4: -+ fl.u.ip4.daddr = ct->tuplehash[dir].tuple.src.u3.ip; -+ fl.u.ip4.flowi4_oif = ifindex; -+ break; -+ case NFPROTO_IPV6: -+ fl.u.ip6.saddr = ct->tuplehash[dir].tuple.dst.u3.in6; -+ fl.u.ip6.daddr = ct->tuplehash[dir].tuple.src.u3.in6; -+ fl.u.ip6.flowi6_oif = ifindex; -+ break; -+ } -+ -+ nf_route(xt_net(par), &dst, &fl, false, xt_family(par)); -+ -+ return dst; -+} -+ -+static int -+xt_flowoffload_route(struct sk_buff *skb, const struct nf_conn *ct, -+ const struct xt_action_param *par, -+ struct nf_flow_route *route, enum ip_conntrack_dir dir) -+{ -+ struct dst_entry *this_dst, *other_dst; -+ -+ this_dst = xt_flowoffload_dst(ct, !dir, par, xt_out(par)->ifindex); -+ other_dst = xt_flowoffload_dst(ct, dir, par, xt_in(par)->ifindex); -+ -+ route->tuple[dir].dst = this_dst; -+ route->tuple[!dir].dst = other_dst; -+ -+ if (!this_dst || !other_dst) -+ return -ENOENT; -+ -+ if (dst_xfrm(this_dst) || dst_xfrm(other_dst)) -+ return -EINVAL; -+ -+ return 0; -+} -+ -+static unsigned int -+flowoffload_tg(struct sk_buff *skb, const struct xt_action_param *par) -+{ -+ const struct xt_flowoffload_target_info *info = par->targinfo; -+ struct tcphdr _tcph, *tcph 
= NULL; -+ enum ip_conntrack_info ctinfo; -+ enum ip_conntrack_dir dir; -+ struct nf_flow_route route; -+ struct flow_offload *flow = NULL; -+ struct nf_conn *ct; -+ struct net *net; -+ -+ if (xt_flowoffload_skip(skb, xt_family(par))) -+ return XT_CONTINUE; -+ -+ ct = nf_ct_get(skb, &ctinfo); -+ if (ct == NULL) -+ return XT_CONTINUE; -+ -+ switch (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum) { -+ case IPPROTO_TCP: -+ if (ct->proto.tcp.state != TCP_CONNTRACK_ESTABLISHED) -+ return XT_CONTINUE; -+ -+ tcph = skb_header_pointer(skb, par->thoff, -+ sizeof(_tcph), &_tcph); -+ if (unlikely(!tcph || tcph->fin || tcph->rst)) -+ return XT_CONTINUE; -+ break; -+ case IPPROTO_UDP: -+ break; -+ default: -+ return XT_CONTINUE; -+ } -+ -+ if (nf_ct_ext_exist(ct, NF_CT_EXT_HELPER) || -+ ct->status & IPS_SEQ_ADJUST) -+ return XT_CONTINUE; -+ -+ if (!nf_ct_is_confirmed(ct)) -+ return XT_CONTINUE; -+ -+ if (!xt_in(par) || !xt_out(par)) -+ return XT_CONTINUE; -+ -+ if (test_and_set_bit(IPS_OFFLOAD_BIT, &ct->status)) -+ return XT_CONTINUE; -+ -+ dir = CTINFO2DIR(ctinfo); -+ -+ if (xt_flowoffload_route(skb, ct, par, &route, dir) == 0) -+ flow = flow_offload_alloc(ct, &route); -+ -+ dst_release(route.tuple[dir].dst); -+ dst_release(route.tuple[!dir].dst); -+ -+ if (!flow) -+ goto err_flow_route; -+ -+ if (tcph) { -+ ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; -+ ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; -+ } -+ -+ if (flow_offload_add(&nf_flowtable, flow) < 0) -+ goto err_flow_add; -+ -+ xt_flowoffload_check_device(xt_in(par)); -+ xt_flowoffload_check_device(xt_out(par)); -+ -+ net = read_pnet(&nf_flowtable.ft_net); -+ if (!net) -+ write_pnet(&nf_flowtable.ft_net, xt_net(par)); -+ -+ if (info->flags & XT_FLOWOFFLOAD_HW) -+ nf_flow_offload_hw_add(xt_net(par), flow, ct); -+ -+ return XT_CONTINUE; -+ -+err_flow_add: -+ flow_offload_free(flow); -+err_flow_route: -+ clear_bit(IPS_OFFLOAD_BIT, &ct->status); -+ return XT_CONTINUE; -+} -+ -+ -+static int flowoffload_chk(const struct xt_tgchk_param *par) -+{ -+ struct xt_flowoffload_target_info *info = par->targinfo; -+ -+ if (info->flags & ~XT_FLOWOFFLOAD_MASK) -+ return -EINVAL; -+ -+ return 0; -+} -+ -+static struct xt_target offload_tg_reg __read_mostly = { -+ .family = NFPROTO_UNSPEC, -+ .name = "FLOWOFFLOAD", -+ .revision = 0, -+ .targetsize = sizeof(struct xt_flowoffload_target_info), -+ .usersize = sizeof(struct xt_flowoffload_target_info), -+ .checkentry = flowoffload_chk, -+ .target = flowoffload_tg, -+ .me = THIS_MODULE, -+}; -+ -+static int xt_flowoffload_table_init(struct nf_flowtable *table) -+{ -+ table->flags = NF_FLOWTABLE_F_HW; -+ nf_flow_table_init(table); -+ return 0; -+} -+ -+static void xt_flowoffload_table_cleanup(struct nf_flowtable *table) -+{ -+ nf_flow_table_free(table); -+} -+ -+static int flow_offload_netdev_event(struct notifier_block *this, -+ unsigned long event, void *ptr) -+{ -+ struct xt_flowoffload_hook *hook = NULL; -+ struct net_device *dev = netdev_notifier_info_to_dev(ptr); -+ -+ if (event != NETDEV_UNREGISTER) -+ return NOTIFY_DONE; -+ -+ spin_lock_bh(&hooks_lock); -+ hook = flow_offload_lookup_hook(dev); -+ if (hook) { -+ hlist_del(&hook->list); -+ } -+ spin_unlock_bh(&hooks_lock); -+ if (hook) { -+ nf_unregister_net_hook(hook->net, &hook->ops); -+ kfree(hook); -+ } -+ -+ nf_flow_table_cleanup(dev); -+ -+ return NOTIFY_DONE; -+} -+ -+static struct notifier_block flow_offload_netdev_notifier = { -+ .notifier_call = flow_offload_netdev_event, -+}; -+ -+static int __init 
xt_flowoffload_tg_init(void) -+{ -+ int ret; -+ -+ register_netdevice_notifier(&flow_offload_netdev_notifier); -+ -+ INIT_DELAYED_WORK(&hook_work, xt_flowoffload_hook_work); -+ -+ ret = xt_flowoffload_table_init(&nf_flowtable); -+ if (ret) -+ return ret; -+ -+ ret = xt_register_target(&offload_tg_reg); -+ if (ret) -+ xt_flowoffload_table_cleanup(&nf_flowtable); -+ -+ return ret; -+} -+ -+static void __exit xt_flowoffload_tg_exit(void) -+{ -+ xt_unregister_target(&offload_tg_reg); -+ xt_flowoffload_table_cleanup(&nf_flowtable); -+ unregister_netdevice_notifier(&flow_offload_netdev_notifier); -+} -+ -+MODULE_LICENSE("GPL"); -+module_init(xt_flowoffload_tg_init); -+module_exit(xt_flowoffload_tg_exit); ---- a/net/netfilter/nf_flow_table_core.c -+++ b/net/netfilter/nf_flow_table_core.c -@@ -7,7 +7,6 @@ - #include - #include - #include --#include - #include - #include - #include -@@ -338,8 +337,7 @@ flow_offload_lookup(struct nf_flowtable - } - EXPORT_SYMBOL_GPL(flow_offload_lookup); - --static int --nf_flow_table_iterate(struct nf_flowtable *flow_table, -+int nf_flow_table_iterate(struct nf_flowtable *flow_table, - void (*iter)(struct flow_offload *flow, void *data), - void *data) - { -@@ -372,6 +370,7 @@ nf_flow_table_iterate(struct nf_flowtabl - - return err; - } -+EXPORT_SYMBOL_GPL(nf_flow_table_iterate); - - static void nf_flow_offload_gc_step(struct flow_offload *flow, void *data) - { ---- /dev/null -+++ b/include/uapi/linux/netfilter/xt_FLOWOFFLOAD.h -@@ -0,0 +1,17 @@ -+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -+#ifndef _XT_FLOWOFFLOAD_H -+#define _XT_FLOWOFFLOAD_H -+ -+#include -+ -+enum { -+ XT_FLOWOFFLOAD_HW = 1 << 0, -+ -+ XT_FLOWOFFLOAD_MASK = XT_FLOWOFFLOAD_HW -+}; -+ -+struct xt_flowoffload_target_info { -+ __u32 flags; -+}; -+ -+#endif /* _XT_FLOWOFFLOAD_H */ ---- a/include/net/netfilter/nf_flow_table.h -+++ b/include/net/netfilter/nf_flow_table.h -@@ -130,6 +130,10 @@ static inline void flow_offload_dead(str - flow->flags |= FLOW_OFFLOAD_DYING; - } - -+int nf_flow_table_iterate(struct nf_flowtable *flow_table, -+ void (*iter)(struct flow_offload *flow, void *data), -+ void *data); -+ - int nf_flow_snat_port(const struct flow_offload *flow, - struct sk_buff *skb, unsigned int thoff, - u8 protocol, enum flow_offload_tuple_dir dir); diff --git a/root/target/linux/generic/hack-5.4/651-wireless_mesh_header.patch b/root/target/linux/generic/hack-5.4/651-wireless_mesh_header.patch deleted file mode 100755 index f545d8eb..00000000 --- a/root/target/linux/generic/hack-5.4/651-wireless_mesh_header.patch +++ /dev/null @@ -1,24 +0,0 @@ -From 6d3bc769657b0ee7c7506dad9911111c4226a7ea Mon Sep 17 00:00:00 2001 -From: Imre Kaloz -Date: Fri, 7 Jul 2017 17:21:05 +0200 -Subject: mac80211: increase wireless mesh header size - -lede-commit 3d4466cfd8f75f717efdb1f96fdde3c70d865fc1 -Signed-off-by: Imre Kaloz ---- - include/linux/netdevice.h | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - ---- a/include/linux/netdevice.h -+++ b/include/linux/netdevice.h -@@ -138,8 +138,8 @@ static inline bool dev_xmit_complete(int - - #if defined(CONFIG_HYPERV_NET) - # define LL_MAX_HEADER 128 --#elif defined(CONFIG_WLAN) || IS_ENABLED(CONFIG_AX25) --# if defined(CONFIG_MAC80211_MESH) -+#elif defined(CONFIG_WLAN) || IS_ENABLED(CONFIG_AX25) || 1 -+# if defined(CONFIG_MAC80211_MESH) || 1 - # define LL_MAX_HEADER 128 - # else - # define LL_MAX_HEADER 96 diff --git a/root/target/linux/generic/hack-5.4/660-fq_codel_defaults.patch 
b/root/target/linux/generic/hack-5.4/660-fq_codel_defaults.patch deleted file mode 100755 index 46bf0e3b..00000000 --- a/root/target/linux/generic/hack-5.4/660-fq_codel_defaults.patch +++ /dev/null @@ -1,27 +0,0 @@ -From a6ccb238939b25851474a279b20367fd24a0e816 Mon Sep 17 00:00:00 2001 -From: Felix Fietkau -Date: Fri, 7 Jul 2017 17:21:53 +0200 -Subject: hack: net: fq_codel: tune defaults for small devices - -Assume that x86_64 devices always have a big memory and do not need this -optimization compared to devices with only 32 MB or 64 MB RAM. - -Signed-off-by: Felix Fietkau ---- - net/sched/sch_fq_codel.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - ---- a/net/sched/sch_fq_codel.c -+++ b/net/sched/sch_fq_codel.c -@@ -470,7 +470,11 @@ static int fq_codel_init(struct Qdisc *s - - sch->limit = 10*1024; - q->flows_cnt = 1024; -+#ifdef CONFIG_X86_64 - q->memory_limit = 32 << 20; /* 32 MBytes */ -+#else -+ q->memory_limit = 4 << 20; /* 4 MBytes */ -+#endif - q->drop_batch_size = 64; - q->quantum = psched_mtu(qdisc_dev(sch)); - INIT_LIST_HEAD(&q->new_flows); diff --git a/root/target/linux/generic/hack-5.4/661-use_fq_codel_by_default.patch b/root/target/linux/generic/hack-5.4/661-use_fq_codel_by_default.patch deleted file mode 100755 index 11f1a25b..00000000 --- a/root/target/linux/generic/hack-5.4/661-use_fq_codel_by_default.patch +++ /dev/null @@ -1,100 +0,0 @@ -From 1d418f7e88035ed7a94073f6354246c66e9193e9 Mon Sep 17 00:00:00 2001 -From: Felix Fietkau -Date: Fri, 7 Jul 2017 17:22:58 +0200 -Subject: fq_codel: switch default qdisc from pfifo_fast to fq_codel and remove pfifo_fast - -Signed-off-by: Felix Fietkau ---- - include/net/sch_generic.h | 3 ++- - net/sched/Kconfig | 3 ++- - net/sched/sch_api.c | 2 +- - net/sched/sch_fq_codel.c | 3 ++- - net/sched/sch_generic.c | 4 ++-- - 5 files changed, 9 insertions(+), 6 deletions(-) - ---- a/include/net/sch_generic.h -+++ b/include/net/sch_generic.h -@@ -617,12 +617,13 @@ extern struct Qdisc_ops noop_qdisc_ops; - extern struct Qdisc_ops pfifo_fast_ops; - extern struct Qdisc_ops mq_qdisc_ops; - extern struct Qdisc_ops noqueue_qdisc_ops; -+extern struct Qdisc_ops fq_codel_qdisc_ops; - extern const struct Qdisc_ops *default_qdisc_ops; - static inline const struct Qdisc_ops * - get_default_qdisc_ops(const struct net_device *dev, int ntx) - { - return ntx < dev->real_num_tx_queues ? 
-- default_qdisc_ops : &pfifo_fast_ops; -+ default_qdisc_ops : &fq_codel_qdisc_ops; - } - - struct Qdisc_class_common { ---- a/net/sched/Kconfig -+++ b/net/sched/Kconfig -@@ -4,8 +4,9 @@ - # - - menuconfig NET_SCHED -- bool "QoS and/or fair queueing" -+ def_bool y - select NET_SCH_FIFO -+ select NET_SCH_FQ_CODEL - ---help--- - When the kernel has several packets to send out over a network - device, it has to decide which ones to send first, which ones to ---- a/net/sched/sch_api.c -+++ b/net/sched/sch_api.c -@@ -2278,7 +2278,7 @@ static int __init pktsched_init(void) - return err; - } - -- register_qdisc(&pfifo_fast_ops); -+ register_qdisc(&fq_codel_qdisc_ops); - register_qdisc(&pfifo_qdisc_ops); - register_qdisc(&bfifo_qdisc_ops); - register_qdisc(&pfifo_head_drop_qdisc_ops); ---- a/net/sched/sch_fq_codel.c -+++ b/net/sched/sch_fq_codel.c -@@ -710,7 +710,7 @@ static const struct Qdisc_class_ops fq_c - .walk = fq_codel_walk, - }; - --static struct Qdisc_ops fq_codel_qdisc_ops __read_mostly = { -+struct Qdisc_ops fq_codel_qdisc_ops __read_mostly = { - .cl_ops = &fq_codel_class_ops, - .id = "fq_codel", - .priv_size = sizeof(struct fq_codel_sched_data), -@@ -725,6 +725,7 @@ static struct Qdisc_ops fq_codel_qdisc_o - .dump_stats = fq_codel_dump_stats, - .owner = THIS_MODULE, - }; -+EXPORT_SYMBOL(fq_codel_qdisc_ops); - - static int __init fq_codel_module_init(void) - { ---- a/net/sched/sch_generic.c -+++ b/net/sched/sch_generic.c -@@ -32,7 +32,7 @@ - #include - - /* Qdisc to use by default */ --const struct Qdisc_ops *default_qdisc_ops = &pfifo_fast_ops; -+const struct Qdisc_ops *default_qdisc_ops = &fq_codel_qdisc_ops; - EXPORT_SYMBOL(default_qdisc_ops); - - static void qdisc_maybe_clear_missed(struct Qdisc *q, -@@ -1079,12 +1079,12 @@ static void attach_one_default_qdisc(str - void *_unused) - { - struct Qdisc *qdisc; -- const struct Qdisc_ops *ops = default_qdisc_ops; -+ const struct Qdisc_ops *ops = &fq_codel_qdisc_ops; - - if (dev->priv_flags & IFF_NO_QUEUE) - ops = &noqueue_qdisc_ops; - else if(dev->type == ARPHRD_CAN) -- ops = &pfifo_fast_ops; -+ ops = &fq_codel_qdisc_ops; - - qdisc = qdisc_create_dflt(dev_queue, ops, TC_H_ROOT, NULL); - if (!qdisc) { diff --git a/root/target/linux/generic/hack-5.4/662-remove_pfifo_fast.patch b/root/target/linux/generic/hack-5.4/662-remove_pfifo_fast.patch deleted file mode 100755 index 9df3a825..00000000 --- a/root/target/linux/generic/hack-5.4/662-remove_pfifo_fast.patch +++ /dev/null @@ -1,243 +0,0 @@ -From b531d492d5ef1cf9dba0f4888eb5fd8624a6d762 Mon Sep 17 00:00:00 2001 -From: Felix Fietkau -Date: Fri, 7 Jul 2017 17:23:42 +0200 -Subject: net: sched: switch default qdisc from pfifo_fast to fq_codel and remove pfifo_fast - -Signed-off-by: Felix Fietkau ---- - net/sched/sch_generic.c | 140 ------------------------------------------------ - 1 file changed, 140 deletions(-) - ---- a/net/sched/sch_generic.c -+++ b/net/sched/sch_generic.c -@@ -620,230 +620,6 @@ struct Qdisc_ops noqueue_qdisc_ops __rea - .owner = THIS_MODULE, - }; - --static const u8 prio2band[TC_PRIO_MAX + 1] = { -- 1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1 --}; -- --/* 3-band FIFO queue: old style, but should be a bit faster than -- generic prio+fifo combination. 
-- */ -- --#define PFIFO_FAST_BANDS 3 -- --/* -- * Private data for a pfifo_fast scheduler containing: -- * - rings for priority bands -- */ --struct pfifo_fast_priv { -- struct skb_array q[PFIFO_FAST_BANDS]; --}; -- --static inline struct skb_array *band2list(struct pfifo_fast_priv *priv, -- int band) --{ -- return &priv->q[band]; --} -- --static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc, -- struct sk_buff **to_free) --{ -- int band = prio2band[skb->priority & TC_PRIO_MAX]; -- struct pfifo_fast_priv *priv = qdisc_priv(qdisc); -- struct skb_array *q = band2list(priv, band); -- unsigned int pkt_len = qdisc_pkt_len(skb); -- int err; -- -- err = skb_array_produce(q, skb); -- -- if (unlikely(err)) { -- if (qdisc_is_percpu_stats(qdisc)) -- return qdisc_drop_cpu(skb, qdisc, to_free); -- else -- return qdisc_drop(skb, qdisc, to_free); -- } -- -- qdisc_update_stats_at_enqueue(qdisc, pkt_len); -- return NET_XMIT_SUCCESS; --} -- --static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc) --{ -- struct pfifo_fast_priv *priv = qdisc_priv(qdisc); -- struct sk_buff *skb = NULL; -- bool need_retry = true; -- int band; -- --retry: -- for (band = 0; band < PFIFO_FAST_BANDS && !skb; band++) { -- struct skb_array *q = band2list(priv, band); -- -- if (__skb_array_empty(q)) -- continue; -- -- skb = __skb_array_consume(q); -- } -- if (likely(skb)) { -- qdisc_update_stats_at_dequeue(qdisc, skb); -- } else if (need_retry && -- test_bit(__QDISC_STATE_MISSED, &qdisc->state)) { -- /* Delay clearing the STATE_MISSED here to reduce -- * the overhead of the second spin_trylock() in -- * qdisc_run_begin() and __netif_schedule() calling -- * in qdisc_run_end(). -- */ -- clear_bit(__QDISC_STATE_MISSED, &qdisc->state); -- -- /* Make sure dequeuing happens after clearing -- * STATE_MISSED. -- */ -- smp_mb__after_atomic(); -- -- need_retry = false; -- -- goto retry; -- } else { -- WRITE_ONCE(qdisc->empty, true); -- } -- -- return skb; --} -- --static struct sk_buff *pfifo_fast_peek(struct Qdisc *qdisc) --{ -- struct pfifo_fast_priv *priv = qdisc_priv(qdisc); -- struct sk_buff *skb = NULL; -- int band; -- -- for (band = 0; band < PFIFO_FAST_BANDS && !skb; band++) { -- struct skb_array *q = band2list(priv, band); -- -- skb = __skb_array_peek(q); -- } -- -- return skb; --} -- --static void pfifo_fast_reset(struct Qdisc *qdisc) --{ -- int i, band; -- struct pfifo_fast_priv *priv = qdisc_priv(qdisc); -- -- for (band = 0; band < PFIFO_FAST_BANDS; band++) { -- struct skb_array *q = band2list(priv, band); -- struct sk_buff *skb; -- -- /* NULL ring is possible if destroy path is due to a failed -- * skb_array_init() in pfifo_fast_init() case. 
-- */ -- if (!q->ring.queue) -- continue; -- -- while ((skb = __skb_array_consume(q)) != NULL) -- kfree_skb(skb); -- } -- -- if (qdisc_is_percpu_stats(qdisc)) { -- for_each_possible_cpu(i) { -- struct gnet_stats_queue *q; -- -- q = per_cpu_ptr(qdisc->cpu_qstats, i); -- q->backlog = 0; -- q->qlen = 0; -- } -- } --} -- --static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb) --{ -- struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS }; -- -- memcpy(&opt.priomap, prio2band, TC_PRIO_MAX + 1); -- if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt)) -- goto nla_put_failure; -- return skb->len; -- --nla_put_failure: -- return -1; --} -- --static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt, -- struct netlink_ext_ack *extack) --{ -- unsigned int qlen = qdisc_dev(qdisc)->tx_queue_len; -- struct pfifo_fast_priv *priv = qdisc_priv(qdisc); -- int prio; -- -- /* guard against zero length rings */ -- if (!qlen) -- return -EINVAL; -- -- for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) { -- struct skb_array *q = band2list(priv, prio); -- int err; -- -- err = skb_array_init(q, qlen, GFP_KERNEL); -- if (err) -- return -ENOMEM; -- } -- -- /* Can by-pass the queue discipline */ -- qdisc->flags |= TCQ_F_CAN_BYPASS; -- return 0; --} -- --static void pfifo_fast_destroy(struct Qdisc *sch) --{ -- struct pfifo_fast_priv *priv = qdisc_priv(sch); -- int prio; -- -- for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) { -- struct skb_array *q = band2list(priv, prio); -- -- /* NULL ring is possible if destroy path is due to a failed -- * skb_array_init() in pfifo_fast_init() case. -- */ -- if (!q->ring.queue) -- continue; -- /* Destroy ring but no need to kfree_skb because a call to -- * pfifo_fast_reset() has already done that work. -- */ -- ptr_ring_cleanup(&q->ring, NULL); -- } --} -- --static int pfifo_fast_change_tx_queue_len(struct Qdisc *sch, -- unsigned int new_len) --{ -- struct pfifo_fast_priv *priv = qdisc_priv(sch); -- struct skb_array *bands[PFIFO_FAST_BANDS]; -- int prio; -- -- for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) { -- struct skb_array *q = band2list(priv, prio); -- -- bands[prio] = q; -- } -- -- return skb_array_resize_multiple(bands, PFIFO_FAST_BANDS, new_len, -- GFP_KERNEL); --} -- --struct Qdisc_ops pfifo_fast_ops __read_mostly = { -- .id = "pfifo_fast", -- .priv_size = sizeof(struct pfifo_fast_priv), -- .enqueue = pfifo_fast_enqueue, -- .dequeue = pfifo_fast_dequeue, -- .peek = pfifo_fast_peek, -- .init = pfifo_fast_init, -- .destroy = pfifo_fast_destroy, -- .reset = pfifo_fast_reset, -- .dump = pfifo_fast_dump, -- .change_tx_queue_len = pfifo_fast_change_tx_queue_len, -- .owner = THIS_MODULE, -- .static_flags = TCQ_F_NOLOCK | TCQ_F_CPUSTATS, --}; --EXPORT_SYMBOL(pfifo_fast_ops); -- - struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, - const struct Qdisc_ops *ops, - struct netlink_ext_ack *extack) diff --git a/root/target/linux/generic/hack-5.4/690-mptcp_v0.96.patch b/root/target/linux/generic/hack-5.4/690-mptcp_v0.96.patch index 850613b5..e2c39495 100755 --- a/root/target/linux/generic/hack-5.4/690-mptcp_v0.96.patch +++ b/root/target/linux/generic/hack-5.4/690-mptcp_v0.96.patch @@ -1,47 +1,24123 @@ ---- a/net/core/dev.c 2018-08-10 10:31:41.199494561 +0200 -+++ b/net/core/dev.c 2018-08-10 10:32:03.635272509 +0200 -@@ -6613,9 +6613,11 @@ +diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt +index dbb68067ba4e..b6c32a29789e 100644 +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ 
b/Documentation/admin-guide/kernel-parameters.txt +@@ -2742,6 +2742,10 @@ + allocations which rules out almost all kernel + allocations. Use with caution! + ++ mptcp_htable_entries= ++ [KNL,NET] Set number of hash buckets for MPTCP token ++ hashtables. ++ + MTD_Partition= [MTD] + Format: ,,, + +diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt +index 8af3771a3ebf..e8fecb8f6370 100644 +--- a/Documentation/networking/ip-sysctl.txt ++++ b/Documentation/networking/ip-sysctl.txt +@@ -818,6 +818,18 @@ tcp_rx_skb_cache - BOOLEAN + + Default: 0 (disabled) + ++MPTCP variables: ++ ++mptcp_enabled - INTEGER ++ Enable or disable Multipath TCP for new connections. ++ Possible values are: ++ ++ 0: Multipath TCP is disabled on all TCP-sockets that are newly created. ++ 1: Multipath TCP is enabled by default on all new TCP-sockets. Note that ++ existing sockets in LISTEN-state will still use regular TCP. ++ 2: Enables Multipath TCP only upon the request of the application ++ throught the socket-option MPTCP_ENABLED. ++ + UDP variables: + + udp_l3mdev_accept - BOOLEAN +diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c +index 535ee41ee421..9f82f93e6e77 100644 +--- a/drivers/infiniband/hw/cxgb4/cm.c ++++ b/drivers/infiniband/hw/cxgb4/cm.c +@@ -3950,7 +3950,7 @@ static void build_cpl_pass_accept_req(struct sk_buff *skb, int stid , u8 tos) + */ + memset(&tmp_opt, 0, sizeof(tmp_opt)); + tcp_clear_options(&tmp_opt); +- tcp_parse_options(&init_net, skb, &tmp_opt, 0, NULL); ++ tcp_parse_options(&init_net, skb, &tmp_opt, NULL, 0, NULL, NULL); + + req = __skb_push(skb, sizeof(*req)); + memset(req, 0, sizeof(*req)); +diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h +index b04b5bd43f54..57e35d51db8c 100644 +--- a/include/linux/skbuff.h ++++ b/include/linux/skbuff.h +@@ -717,7 +717,7 @@ struct sk_buff { + * want to keep them across layers you have to do a skb_clone() + * first. This is owned by whoever has the skb queued ATM. + */ +- char cb[48] __aligned(8); ++ char cb[80] __aligned(8); + + union { + struct { +diff --git a/include/linux/tcp.h b/include/linux/tcp.h +index 358deb4ff830..aebfedba9838 100644 +--- a/include/linux/tcp.h ++++ b/include/linux/tcp.h +@@ -54,7 +54,7 @@ static inline unsigned int tcp_optlen(const struct sk_buff *skb) + /* TCP Fast Open */ + #define TCP_FASTOPEN_COOKIE_MIN 4 /* Min Fast Open Cookie size in bytes */ + #define TCP_FASTOPEN_COOKIE_MAX 16 /* Max Fast Open Cookie size in bytes */ +-#define TCP_FASTOPEN_COOKIE_SIZE 8 /* the size employed by this impl. */ ++#define TCP_FASTOPEN_COOKIE_SIZE 4 /* the size employed by this impl. */ + + /* TCP Fast Open Cookie as stored in memory */ + struct tcp_fastopen_cookie { +@@ -74,6 +74,56 @@ struct tcp_sack_block { + u32 end_seq; + }; + ++struct tcp_out_options { ++ u16 options; /* bit field of OPTION_* */ ++ u16 mss; /* 0 to disable */ ++ u8 ws; /* window scale, 0 to disable */ ++ u8 num_sack_blocks; /* number of SACK blocks to include */ ++ u8 hash_size; /* bytes in hash_location */ ++ __u8 *hash_location; /* temporary pointer, overloaded */ ++ __u32 tsval, tsecr; /* need to include OPTION_TS */ ++ struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */ ++#ifdef CONFIG_MPTCP ++ u16 mptcp_options; /* bit field of MPTCP related OPTION_* */ ++ u8 dss_csum:1, /* dss-checksum required? 
*/ ++ add_addr_v4:1, ++ add_addr_v6:1, ++ mptcp_ver:4; ++ ++ union { ++ struct { ++ __u64 sender_key; /* sender's key for mptcp */ ++ __u64 receiver_key; /* receiver's key for mptcp */ ++ } mp_capable; ++ ++ struct { ++ __u64 sender_truncated_mac; ++ __u32 sender_nonce; ++ /* random number of the sender */ ++ __u32 token; /* token for mptcp */ ++ u8 low_prio:1; ++ } mp_join_syns; ++ }; ++ ++ struct { ++ __u64 trunc_mac; ++ struct in_addr addr; ++ u16 port; ++ u8 addr_id; ++ } add_addr4; ++ ++ struct { ++ __u64 trunc_mac; ++ struct in6_addr addr; ++ u16 port; ++ u8 addr_id; ++ } add_addr6; ++ ++ u16 remove_addrs; /* list of address id */ ++ u8 addr_id; /* address id (mp_join or add_address) */ ++#endif /* CONFIG_MPTCP */ ++}; ++ + /*These are used to set the sack_ok field in struct tcp_options_received */ + #define TCP_SACK_SEEN (1 << 0) /*1 = peer is SACK capable, */ + #define TCP_DSACK_SEEN (1 << 2) /*1 = DSACK was received from peer*/ +@@ -97,6 +147,9 @@ struct tcp_options_received { + u16 mss_clamp; /* Maximal mss, negotiated at connection setup */ + }; + ++struct mptcp_cb; ++struct mptcp_tcp_sock; ++ + static inline void tcp_clear_options(struct tcp_options_received *rx_opt) + { + rx_opt->tstamp_ok = rx_opt->sack_ok = 0; +@@ -135,6 +188,8 @@ static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req) + return (struct tcp_request_sock *)req; + } + ++struct tcp_md5sig_key; ++ + struct tcp_sock { + /* inet_connection_sock has to be the first member of tcp_sock */ + struct inet_connection_sock inet_conn; +@@ -295,6 +350,7 @@ struct tcp_sock { + u32 rate_interval_us; /* saved rate sample: time elapsed */ + + u32 rcv_wnd; /* Current receiver window */ ++ u32 rcv_right_edge; /* Highest announced right edge */ + u32 write_seq; /* Tail(+1) of data held in tcp send buffer */ + u32 notsent_lowat; /* TCP_NOTSENT_LOWAT */ + u32 pushed_seq; /* Last pushed seq, required to talk to windows */ +@@ -397,6 +453,44 @@ struct tcp_sock { + */ + struct request_sock __rcu *fastopen_rsk; + u32 *saved_syn; ++ ++ /* MPTCP/TCP-specific callbacks */ ++ const struct tcp_sock_ops *ops; ++ ++ struct mptcp_cb *mpcb; ++ struct sock *meta_sk; ++ /* We keep these flags even if CONFIG_MPTCP is not checked, because ++ * it allows checking MPTCP capability just by checking the mpc flag, ++ * rather than adding ifdefs everywhere. ++ */ ++ u32 mpc:1, /* Other end is multipath capable */ ++ inside_tk_table:1, /* Is the tcp_sock inside the token-table? */ ++ send_mp_fclose:1, ++ request_mptcp:1, /* Did we send out an MP_CAPABLE? ++ * (this speeds up mptcp_doit() in tcp_recvmsg) ++ */ ++ pf:1, /* Potentially Failed state: when this flag is set, we ++ * stop using the subflow ++ */ ++ mp_killed:1, /* Killed with a tcp_done in mptcp? */ ++ is_master_sk:1, ++ close_it:1, /* Must close socket in mptcp_data_ready? 
*/ ++ closing:1, ++ mptcp_ver:4, ++ mptcp_sched_setsockopt:1, ++ mptcp_pm_setsockopt:1, ++ record_master_info:1, ++ tcp_disconnect:1; ++ struct mptcp_tcp_sock *mptcp; ++#ifdef CONFIG_MPTCP ++#define MPTCP_SCHED_NAME_MAX 16 ++#define MPTCP_PM_NAME_MAX 16 ++ struct hlist_nulls_node tk_table; ++ u32 mptcp_loc_token; ++ u64 mptcp_loc_key; ++ char mptcp_sched_name[MPTCP_SCHED_NAME_MAX]; ++ char mptcp_pm_name[MPTCP_PM_NAME_MAX]; ++#endif /* CONFIG_MPTCP */ + }; + + enum tsq_enum { +@@ -408,6 +502,8 @@ enum tsq_enum { + TCP_MTU_REDUCED_DEFERRED, /* tcp_v{4|6}_err() could not call + * tcp_v{4|6}_mtu_reduced() + */ ++ MPTCP_PATH_MANAGER_DEFERRED, /* MPTCP deferred creation of new subflows */ ++ MPTCP_SUB_DEFERRED, /* A subflow got deferred - process them */ + }; + + enum tsq_flags { +@@ -417,6 +513,8 @@ enum tsq_flags { + TCPF_WRITE_TIMER_DEFERRED = (1UL << TCP_WRITE_TIMER_DEFERRED), + TCPF_DELACK_TIMER_DEFERRED = (1UL << TCP_DELACK_TIMER_DEFERRED), + TCPF_MTU_REDUCED_DEFERRED = (1UL << TCP_MTU_REDUCED_DEFERRED), ++ TCPF_PATH_MANAGER_DEFERRED = (1UL << MPTCP_PATH_MANAGER_DEFERRED), ++ TCPF_SUB_DEFERRED = (1UL << MPTCP_SUB_DEFERRED), + }; + + static inline struct tcp_sock *tcp_sk(const struct sock *sk) +@@ -440,6 +538,7 @@ struct tcp_timewait_sock { + #ifdef CONFIG_TCP_MD5SIG + struct tcp_md5sig_key *tw_md5_key; + #endif ++ struct mptcp_tw *mptcp_tw; + }; + + static inline struct tcp_timewait_sock *tcp_twsk(const struct sock *sk) +diff --git a/include/net/inet_common.h b/include/net/inet_common.h +index ae2ba897675c..aa91a56bd7af 100644 +--- a/include/net/inet_common.h ++++ b/include/net/inet_common.h +@@ -2,6 +2,7 @@ + #ifndef _INET_COMMON_H + #define _INET_COMMON_H + ++#include + #include + + extern const struct proto_ops inet_stream_ops; +@@ -16,6 +17,8 @@ struct sock; + struct sockaddr; + struct socket; + ++int inet_create(struct net *net, struct socket *sock, int protocol, int kern); ++int inet6_create(struct net *net, struct socket *sock, int protocol, int kern); + int inet_release(struct socket *sock); + int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, + int addr_len, int flags); +diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h +index 13792c0ef46e..e99cc510610f 100644 +--- a/include/net/inet_connection_sock.h ++++ b/include/net/inet_connection_sock.h +@@ -25,6 +25,7 @@ + + struct inet_bind_bucket; + struct tcp_congestion_ops; ++struct tcp_options_received; + + /* + * Pointers to address related TCP functions +diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h +index 34c4436fd18f..828f79528b32 100644 +--- a/include/net/inet_sock.h ++++ b/include/net/inet_sock.h +@@ -79,7 +79,7 @@ struct inet_request_sock { + #define ireq_state req.__req_common.skc_state + #define ireq_family req.__req_common.skc_family + +- u16 snd_wscale : 4, ++ u32 snd_wscale : 4, + rcv_wscale : 4, + tstamp_ok : 1, + sack_ok : 1, +@@ -87,6 +87,8 @@ struct inet_request_sock { + ecn_ok : 1, + acked : 1, + no_srccheck: 1, ++ mptcp_rqsk : 1, ++ saw_mpc : 1, + smc_ok : 1; + u32 ir_mark; + union { +diff --git a/include/net/mptcp.h b/include/net/mptcp.h +new file mode 100644 +index 000000000000..196b8939cbab +--- /dev/null ++++ b/include/net/mptcp.h +@@ -0,0 +1,1577 @@ ++/* ++ * MPTCP implementation ++ * ++ * Initial Design & Implementation: ++ * SĂ©bastien BarrĂ© ++ * ++ * Current Maintainer & Author: ++ * Christoph Paasch ++ * ++ * Additional authors: ++ * Jaakko Korkeaniemi ++ * Gregory Detal ++ * Fabien DuchĂªne ++ * Andreas Seelinger ++ * Lavkesh Lahngir ++ * 
Andreas Ripke ++ * Vlad Dogaru ++ * Octavian Purdila ++ * John Ronan ++ * Catalin Nicutar ++ * Brandon Heller ++ * ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License ++ * as published by the Free Software Foundation; either version ++ * 2 of the License, or (at your option) any later version. ++ */ ++ ++#ifndef _MPTCP_H ++#define _MPTCP_H ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++ ++#if defined(__LITTLE_ENDIAN_BITFIELD) ++ #define ntohll(x) be64_to_cpu(x) ++ #define htonll(x) cpu_to_be64(x) ++#elif defined(__BIG_ENDIAN_BITFIELD) ++ #define ntohll(x) (x) ++ #define htonll(x) (x) ++#endif ++ ++struct mptcp_loc4 { ++ u8 loc4_id; ++ u8 low_prio:1; ++ int if_idx; ++ struct in_addr addr; ++}; ++ ++struct mptcp_rem4 { ++ u8 rem4_id; ++ __be16 port; ++ struct in_addr addr; ++}; ++ ++struct mptcp_loc6 { ++ u8 loc6_id; ++ u8 low_prio:1; ++ int if_idx; ++ struct in6_addr addr; ++}; ++ ++struct mptcp_rem6 { ++ u8 rem6_id; ++ __be16 port; ++ struct in6_addr addr; ++}; ++ ++struct mptcp_request_sock { ++ struct tcp_request_sock req; ++ struct hlist_nulls_node hash_entry; ++ ++ union { ++ struct { ++ /* Only on initial subflows */ ++ u64 mptcp_loc_key; ++ u64 mptcp_rem_key; ++ u32 mptcp_loc_token; ++ }; ++ ++ struct { ++ /* Only on additional subflows */ ++ u32 mptcp_rem_nonce; ++ u32 mptcp_loc_nonce; ++ u64 mptcp_hash_tmac; ++ }; ++ }; ++ ++ u8 loc_id; ++ u8 rem_id; /* Address-id in the MP_JOIN */ ++ u16 dss_csum:1, ++ rem_key_set:1, ++ is_sub:1, /* Is this a new subflow? */ ++ low_prio:1, /* Interface set to low-prio? */ ++ rcv_low_prio:1, ++ mptcp_ver:4; ++}; ++ ++struct mptcp_options_received { ++ u16 saw_mpc:1, ++ dss_csum:1, ++ drop_me:1, ++ ++ is_mp_join:1, ++ join_ack:1, ++ ++ saw_low_prio:2, /* 0x1 - low-prio set for this subflow ++ * 0x2 - low-prio set for another subflow ++ */ ++ low_prio:1, ++ ++ saw_add_addr:2, /* Saw at least one add_addr option: ++ * 0x1: IPv4 - 0x2: IPv6 ++ */ ++ more_add_addr:1, /* Saw one more add-addr. */ ++ ++ saw_rem_addr:1, /* Saw at least one rem_addr option */ ++ more_rem_addr:1, /* Saw one more rem-addr. 
*/ ++ ++ mp_fail:1, ++ mp_fclose:1; ++ u8 rem_id; /* Address-id in the MP_JOIN */ ++ u8 prio_addr_id; /* Address-id in the MP_PRIO */ ++ ++ const unsigned char *add_addr_ptr; /* Pointer to add-address option */ ++ const unsigned char *rem_addr_ptr; /* Pointer to rem-address option */ ++ ++ u32 data_ack; ++ u32 data_seq; ++ u16 data_len; ++ ++ u8 mptcp_ver; /* MPTCP version */ ++ ++ /* Key inside the option (from mp_capable or fast_close) */ ++ u64 mptcp_sender_key; ++ u64 mptcp_receiver_key; ++ ++ u32 mptcp_rem_token; /* Remote token */ ++ ++ u32 mptcp_recv_nonce; ++ u64 mptcp_recv_tmac; ++ u8 mptcp_recv_mac[20]; ++}; ++ ++struct mptcp_tcp_sock { ++ struct hlist_node node; ++ struct hlist_node cb_list; ++ struct mptcp_options_received rx_opt; ++ ++ /* Those three fields record the current mapping */ ++ u64 map_data_seq; ++ u32 map_subseq; ++ u16 map_data_len; ++ u16 slave_sk:1, ++ fully_established:1, ++ second_packet:1, ++ attached:1, ++ send_mp_fail:1, ++ include_mpc:1, ++ mapping_present:1, ++ map_data_fin:1, ++ low_prio:1, /* use this socket as backup */ ++ rcv_low_prio:1, /* Peer sent low-prio option to us */ ++ send_mp_prio:1, /* Trigger to send mp_prio on this socket */ ++ pre_established:1; /* State between sending 3rd ACK and ++ * receiving the fourth ack of new subflows. ++ */ ++ ++ /* isn: needed to translate abs to relative subflow seqnums */ ++ u32 snt_isn; ++ u32 rcv_isn; ++ u8 path_index; ++ u8 loc_id; ++ u8 rem_id; ++ u8 sk_err; ++ ++#define MPTCP_SCHED_SIZE 16 ++ u8 mptcp_sched[MPTCP_SCHED_SIZE] __aligned(8); ++ ++ int init_rcv_wnd; ++ u32 infinite_cutoff_seq; ++ struct delayed_work work; ++ u32 mptcp_loc_nonce; ++ struct tcp_sock *tp; ++ u32 last_end_data_seq; ++ ++ /* MP_JOIN subflow: timer for retransmitting the 3rd ack */ ++ struct timer_list mptcp_ack_timer; ++ ++ /* HMAC of the third ack */ ++ char sender_mac[SHA256_DIGEST_SIZE]; ++}; ++ ++struct mptcp_tw { ++ struct list_head list; ++ u64 loc_key; ++ u64 rcv_nxt; ++ struct mptcp_cb __rcu *mpcb; ++ u8 meta_tw:1, ++ in_list:1; ++}; ++ ++#define MPTCP_PM_NAME_MAX 16 ++struct mptcp_pm_ops { ++ struct list_head list; ++ ++ /* Signal the creation of a new MPTCP-session. 
*/ ++ void (*new_session)(const struct sock *meta_sk); ++ void (*release_sock)(struct sock *meta_sk); ++ void (*fully_established)(struct sock *meta_sk); ++ void (*close_session)(struct sock *meta_sk); ++ void (*new_remote_address)(struct sock *meta_sk); ++ int (*get_local_id)(const struct sock *meta_sk, sa_family_t family, ++ union inet_addr *addr, bool *low_prio); ++ void (*addr_signal)(struct sock *sk, unsigned *size, ++ struct tcp_out_options *opts, struct sk_buff *skb); ++ void (*add_raddr)(struct mptcp_cb *mpcb, const union inet_addr *addr, ++ sa_family_t family, __be16 port, u8 id); ++ void (*rem_raddr)(struct mptcp_cb *mpcb, u8 rem_id); ++ void (*init_subsocket_v4)(struct sock *sk, struct in_addr addr); ++ void (*init_subsocket_v6)(struct sock *sk, struct in6_addr addr); ++ void (*established_subflow)(struct sock *sk); ++ void (*delete_subflow)(struct sock *sk); ++ void (*prio_changed)(struct sock *sk, int low_prio); ++ ++ char name[MPTCP_PM_NAME_MAX]; ++ struct module *owner; ++}; ++ ++struct mptcp_sched_ops { ++ struct list_head list; ++ ++ struct sock * (*get_subflow)(struct sock *meta_sk, ++ struct sk_buff *skb, ++ bool zero_wnd_test); ++ struct sk_buff * (*next_segment)(struct sock *meta_sk, ++ int *reinject, ++ struct sock **subsk, ++ unsigned int *limit); ++ void (*init)(struct sock *sk); ++ void (*release)(struct sock *sk); ++ ++ char name[MPTCP_SCHED_NAME_MAX]; ++ struct module *owner; ++}; ++ ++struct mptcp_cb { ++ /* list of sockets in this multipath connection */ ++ struct hlist_head conn_list; ++ /* list of sockets that need a call to release_cb */ ++ struct hlist_head callback_list; ++ ++ /* Lock used for protecting the different rcu-lists of mptcp_cb */ ++ spinlock_t mpcb_list_lock; ++ ++ /* High-order bits of 64-bit sequence numbers */ ++ u32 snd_high_order[2]; ++ u32 rcv_high_order[2]; ++ ++ u16 send_infinite_mapping:1, ++ send_mptcpv1_mpcapable:1, ++ rem_key_set:1, ++ in_time_wait:1, ++ list_rcvd:1, /* XXX TO REMOVE */ ++ addr_signal:1, /* Path-manager wants us to call addr_signal */ ++ dss_csum:1, ++ server_side:1, ++ infinite_mapping_rcv:1, ++ infinite_mapping_snd:1, ++ dfin_combined:1, /* Was the DFIN combined with subflow-fin? */ ++ passive_close:1, ++ snd_hiseq_index:1, /* Index in snd_high_order of snd_nxt */ ++ rcv_hiseq_index:1, /* Index in rcv_high_order of rcv_nxt */ ++ tcp_ca_explicit_set:1; /* was meta CC set by app? */ ++ ++#define MPTCP_SCHED_DATA_SIZE 8 ++ u8 mptcp_sched[MPTCP_SCHED_DATA_SIZE] __aligned(8); ++ const struct mptcp_sched_ops *sched_ops; ++ ++ struct sk_buff_head reinject_queue; ++ /* First cache-line boundary is here minus 8 bytes. But from the ++ * reinject-queue only the next and prev pointers are regularly ++ * accessed. Thus, the whole data-path is on a single cache-line. ++ */ ++ ++ u64 csum_cutoff_seq; ++ u64 infinite_rcv_seq; ++ ++ /***** Start of fields, used for connection closure */ ++ unsigned char mptw_state; ++ u8 dfin_path_index; ++ ++ struct list_head tw_list; ++ ++ /***** Start of fields, used for subflow establishment and closure */ ++ refcount_t mpcb_refcnt; ++ ++ /* Mutex needed, because otherwise mptcp_close will complain that the ++ * socket is owned by the user. ++ * E.g., mptcp_sub_close_wq is taking the meta-lock. ++ */ ++ struct mutex mpcb_mutex; ++ ++ /***** Start of fields, used for subflow establishment */ ++ struct sock *meta_sk; ++ ++ /* Master socket, also part of the conn_list, this ++ * socket is the one that the application sees. 
++ */ ++ struct sock *master_sk; ++ ++ __u64 mptcp_loc_key; ++ __u64 mptcp_rem_key; ++ __u32 mptcp_loc_token; ++ __u32 mptcp_rem_token; ++ ++#define MPTCP_PM_SIZE 608 ++ u8 mptcp_pm[MPTCP_PM_SIZE] __aligned(8); ++ const struct mptcp_pm_ops *pm_ops; ++ ++ unsigned long path_index_bits; ++ ++ __u8 mptcp_ver; ++ ++ /* Original snd/rcvbuf of the initial subflow. ++ * Used for the new subflows on the server-side to allow correct ++ * autotuning ++ */ ++ int orig_sk_rcvbuf; ++ int orig_sk_sndbuf; ++ u32 orig_window_clamp; ++ ++ struct tcp_info *master_info; ++}; ++ ++#define MPTCP_VERSION_0 0 ++#define MPTCP_VERSION_1 1 ++ ++#define MPTCP_SUB_CAPABLE 0 ++#define MPTCP_SUB_LEN_CAPABLE_SYN 12 ++#define MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN 12 ++#define MPTCP_SUB_LEN_CAPABLE_ACK 20 ++#define MPTCP_SUB_LEN_CAPABLE_ACK_ALIGN 20 ++ ++#define MPTCPV1_SUB_LEN_CAPABLE_SYN 4 ++#define MPTCPV1_SUB_LEN_CAPABLE_SYN_ALIGN 4 ++#define MPTCPV1_SUB_LEN_CAPABLE_SYNACK 12 ++#define MPTCPV1_SUB_LEN_CAPABLE_SYNACK_ALIGN 12 ++#define MPTCPV1_SUB_LEN_CAPABLE_ACK 20 ++#define MPTCPV1_SUB_LEN_CAPABLE_ACK_ALIGN 20 ++#define MPTCPV1_SUB_LEN_CAPABLE_DATA 22 ++#define MPTCPV1_SUB_LEN_CAPABLE_DATA_CSUM 24 ++#define MPTCPV1_SUB_LEN_CAPABLE_DATA_ALIGN 24 ++ ++#define MPTCP_SUB_JOIN 1 ++#define MPTCP_SUB_LEN_JOIN_SYN 12 ++#define MPTCP_SUB_LEN_JOIN_SYN_ALIGN 12 ++#define MPTCP_SUB_LEN_JOIN_SYNACK 16 ++#define MPTCP_SUB_LEN_JOIN_SYNACK_ALIGN 16 ++#define MPTCP_SUB_LEN_JOIN_ACK 24 ++#define MPTCP_SUB_LEN_JOIN_ACK_ALIGN 24 ++ ++#define MPTCP_SUB_DSS 2 ++#define MPTCP_SUB_LEN_DSS 4 ++#define MPTCP_SUB_LEN_DSS_ALIGN 4 ++ ++/* Lengths for seq and ack are the ones without the generic MPTCP-option header, ++ * as they are part of the DSS-option. ++ * To get the total length, just add the different options together. ++ */ ++#define MPTCP_SUB_LEN_SEQ 10 ++#define MPTCP_SUB_LEN_SEQ_CSUM 12 ++#define MPTCP_SUB_LEN_SEQ_ALIGN 12 ++ ++#define MPTCP_SUB_LEN_SEQ_64 14 ++#define MPTCP_SUB_LEN_SEQ_CSUM_64 16 ++#define MPTCP_SUB_LEN_SEQ_64_ALIGN 16 ++ ++#define MPTCP_SUB_LEN_ACK 4 ++#define MPTCP_SUB_LEN_ACK_ALIGN 4 ++ ++#define MPTCP_SUB_LEN_ACK_64 8 ++#define MPTCP_SUB_LEN_ACK_64_ALIGN 8 ++ ++/* This is the "default" option-length we will send out most often. ++ * MPTCP DSS-header ++ * 32-bit data sequence number ++ * 32-bit data ack ++ * ++ * It is necessary to calculate the effective MSS we will be using when ++ * sending data. 
++ */ ++#define MPTCP_SUB_LEN_DSM_ALIGN (MPTCP_SUB_LEN_DSS_ALIGN + \ ++ MPTCP_SUB_LEN_SEQ_ALIGN + \ ++ MPTCP_SUB_LEN_ACK_ALIGN) ++ ++#define MPTCP_SUB_ADD_ADDR 3 ++#define MPTCP_SUB_LEN_ADD_ADDR4 8 ++#define MPTCP_SUB_LEN_ADD_ADDR4_VER1 16 ++#define MPTCP_SUB_LEN_ADD_ADDR6 20 ++#define MPTCP_SUB_LEN_ADD_ADDR6_VER1 28 ++#define MPTCP_SUB_LEN_ADD_ADDR4_ALIGN 8 ++#define MPTCP_SUB_LEN_ADD_ADDR4_ALIGN_VER1 16 ++#define MPTCP_SUB_LEN_ADD_ADDR6_ALIGN 20 ++#define MPTCP_SUB_LEN_ADD_ADDR6_ALIGN_VER1 28 ++ ++#define MPTCP_SUB_REMOVE_ADDR 4 ++#define MPTCP_SUB_LEN_REMOVE_ADDR 4 ++ ++#define MPTCP_SUB_PRIO 5 ++#define MPTCP_SUB_LEN_PRIO 3 ++#define MPTCP_SUB_LEN_PRIO_ADDR 4 ++#define MPTCP_SUB_LEN_PRIO_ALIGN 4 ++ ++#define MPTCP_SUB_FAIL 6 ++#define MPTCP_SUB_LEN_FAIL 12 ++#define MPTCP_SUB_LEN_FAIL_ALIGN 12 ++ ++#define MPTCP_SUB_FCLOSE 7 ++#define MPTCP_SUB_LEN_FCLOSE 12 ++#define MPTCP_SUB_LEN_FCLOSE_ALIGN 12 ++ ++ ++#define OPTION_MPTCP (1 << 5) ++ ++/* Max number of fastclose retransmissions */ ++#define MPTCP_FASTCLOSE_RETRIES 3 ++ ++#ifdef CONFIG_MPTCP ++ ++/* Used for checking if the mptcp initialization has been successful */ ++extern bool mptcp_init_failed; ++ ++/* MPTCP options */ ++#define OPTION_TYPE_SYN (1 << 0) ++#define OPTION_TYPE_SYNACK (1 << 1) ++#define OPTION_TYPE_ACK (1 << 2) ++#define OPTION_MP_CAPABLE (1 << 3) ++#define OPTION_DATA_ACK (1 << 4) ++#define OPTION_ADD_ADDR (1 << 5) ++#define OPTION_MP_JOIN (1 << 6) ++#define OPTION_MP_FAIL (1 << 7) ++#define OPTION_MP_FCLOSE (1 << 8) ++#define OPTION_REMOVE_ADDR (1 << 9) ++#define OPTION_MP_PRIO (1 << 10) ++ ++/* MPTCP flags: both TX and RX */ ++#define MPTCPHDR_SEQ 0x01 /* DSS.M option is present */ ++#define MPTCPHDR_FIN 0x02 /* DSS.F option is present */ ++#define MPTCPHDR_SEQ64_INDEX 0x04 /* index of seq in mpcb->snd_high_order */ ++#define MPTCPHDR_MPC_DATA 0x08 ++/* MPTCP flags: RX only */ ++#define MPTCPHDR_ACK 0x10 ++#define MPTCPHDR_SEQ64_SET 0x20 /* Did we received a 64-bit seq number? */ ++#define MPTCPHDR_SEQ64_OFO 0x40 /* Is it not in our circular array? */ ++#define MPTCPHDR_DSS_CSUM 0x80 ++/* MPTCP flags: TX only */ ++#define MPTCPHDR_INF 0x10 ++#define MPTCP_REINJECT 0x20 /* Did we reinject this segment? 
*/ ++ ++struct mptcp_option { ++ __u8 kind; ++ __u8 len; ++#if defined(__LITTLE_ENDIAN_BITFIELD) ++ __u8 ver:4, ++ sub:4; ++#elif defined(__BIG_ENDIAN_BITFIELD) ++ __u8 sub:4, ++ ver:4; ++#else ++#error "Adjust your defines" ++#endif ++}; ++ ++struct mp_capable { ++ __u8 kind; ++ __u8 len; ++#if defined(__LITTLE_ENDIAN_BITFIELD) ++ __u8 ver:4, ++ sub:4; ++ __u8 h:1, ++ rsv:5, ++ b:1, ++ a:1; ++#elif defined(__BIG_ENDIAN_BITFIELD) ++ __u8 sub:4, ++ ver:4; ++ __u8 a:1, ++ b:1, ++ rsv:5, ++ h:1; ++#else ++#error "Adjust your defines" ++#endif ++ __u64 sender_key; ++ __u64 receiver_key; ++} __attribute__((__packed__)); ++ ++struct mp_join { ++ __u8 kind; ++ __u8 len; ++#if defined(__LITTLE_ENDIAN_BITFIELD) ++ __u8 b:1, ++ rsv:3, ++ sub:4; ++#elif defined(__BIG_ENDIAN_BITFIELD) ++ __u8 sub:4, ++ rsv:3, ++ b:1; ++#else ++#error "Adjust your defines" ++#endif ++ __u8 addr_id; ++ union { ++ struct { ++ u32 token; ++ u32 nonce; ++ } syn; ++ struct { ++ __u64 mac; ++ u32 nonce; ++ } synack; ++ struct { ++ __u8 mac[20]; ++ } ack; ++ } u; ++} __attribute__((__packed__)); ++ ++struct mp_dss { ++ __u8 kind; ++ __u8 len; ++#if defined(__LITTLE_ENDIAN_BITFIELD) ++ __u16 rsv1:4, ++ sub:4, ++ A:1, ++ a:1, ++ M:1, ++ m:1, ++ F:1, ++ rsv2:3; ++#elif defined(__BIG_ENDIAN_BITFIELD) ++ __u16 sub:4, ++ rsv1:4, ++ rsv2:3, ++ F:1, ++ m:1, ++ M:1, ++ a:1, ++ A:1; ++#else ++#error "Adjust your defines" ++#endif ++}; ++ ++struct mp_add_addr { ++ __u8 kind; ++ __u8 len; ++#if defined(__LITTLE_ENDIAN_BITFIELD) ++ union { ++ struct { ++ __u8 ipver:4, ++ sub:4; ++ } v0; ++ struct { ++ __u8 echo:1, ++ rsv:3, ++ sub:4; ++ } v1; ++ } u_bit; ++#elif defined(__BIG_ENDIAN_BITFIELD) ++ union { ++ struct { ++ __u8 sub:4, ++ ipver:4; ++ } v0; ++ struct { ++ __u8 sub:4, ++ rsv:3, ++ echo:1; ++ } v1; ++ } u_bit; ++#else ++#error "Adjust your defines" ++#endif ++ __u8 addr_id; ++ union { ++ struct { ++ struct in_addr addr; ++ __be16 port; ++ __u8 mac[8]; ++ } v4; ++ struct { ++ struct in6_addr addr; ++ __be16 port; ++ __u8 mac[8]; ++ } v6; ++ } u; ++} __attribute__((__packed__)); ++ ++struct mp_remove_addr { ++ __u8 kind; ++ __u8 len; ++#if defined(__LITTLE_ENDIAN_BITFIELD) ++ __u8 rsv:4, ++ sub:4; ++#elif defined(__BIG_ENDIAN_BITFIELD) ++ __u8 sub:4, ++ rsv:4; ++#else ++#error "Adjust your defines" ++#endif ++ /* list of addr_id */ ++ __u8 addrs_id; ++}; ++ ++struct mp_fail { ++ __u8 kind; ++ __u8 len; ++#if defined(__LITTLE_ENDIAN_BITFIELD) ++ __u16 rsv1:4, ++ sub:4, ++ rsv2:8; ++#elif defined(__BIG_ENDIAN_BITFIELD) ++ __u16 sub:4, ++ rsv1:4, ++ rsv2:8; ++#else ++#error "Adjust your defines" ++#endif ++ __be64 data_seq; ++} __attribute__((__packed__)); ++ ++struct mp_fclose { ++ __u8 kind; ++ __u8 len; ++#if defined(__LITTLE_ENDIAN_BITFIELD) ++ __u16 rsv1:4, ++ sub:4, ++ rsv2:8; ++#elif defined(__BIG_ENDIAN_BITFIELD) ++ __u16 sub:4, ++ rsv1:4, ++ rsv2:8; ++#else ++#error "Adjust your defines" ++#endif ++ __u64 key; ++} __attribute__((__packed__)); ++ ++struct mp_prio { ++ __u8 kind; ++ __u8 len; ++#if defined(__LITTLE_ENDIAN_BITFIELD) ++ __u8 b:1, ++ rsv:3, ++ sub:4; ++#elif defined(__BIG_ENDIAN_BITFIELD) ++ __u8 sub:4, ++ rsv:3, ++ b:1; ++#else ++#error "Adjust your defines" ++#endif ++ __u8 addr_id; ++} __attribute__((__packed__)); ++ ++struct mptcp_hashtable { ++ struct hlist_nulls_head *hashtable; ++ unsigned int mask; ++}; ++ ++static inline int mptcp_sub_len_dss(const struct mp_dss *m, const int csum) ++{ ++ return 4 + m->A * (4 + m->a * 4) + m->M * (10 + m->m * 4 + csum * 2); ++} ++ ++#define MPTCP_ENABLE 0x01 ++#define 
MPTCP_SOCKOPT 0x02 ++#define MPTCP_CLIENT_DISABLE 0x04 ++#define MPTCP_SERVER_DISABLE 0x08 ++ ++extern int sysctl_mptcp_enabled; ++extern int sysctl_mptcp_version; ++extern int sysctl_mptcp_checksum; ++extern int sysctl_mptcp_debug; ++extern int sysctl_mptcp_syn_retries; ++ ++extern struct workqueue_struct *mptcp_wq; ++ ++#define mptcp_debug(fmt, args...) \ ++ do { \ ++ if (unlikely(sysctl_mptcp_debug)) \ ++ pr_err(fmt, ##args); \ ++ } while (0) ++ ++static inline struct sock *mptcp_to_sock(const struct mptcp_tcp_sock *mptcp) ++{ ++ return (struct sock *)mptcp->tp; ++} ++ ++#define mptcp_for_each_sub(__mpcb, __mptcp) \ ++ hlist_for_each_entry_rcu(__mptcp, &((__mpcb)->conn_list), node) ++ ++/* Must be called with the appropriate lock held */ ++#define mptcp_for_each_sub_safe(__mpcb, __mptcp, __tmp) \ ++ hlist_for_each_entry_safe(__mptcp, __tmp, &((__mpcb)->conn_list), node) ++ ++/* Iterates over all bit set to 1 in a bitset */ ++#define mptcp_for_each_bit_set(b, i) \ ++ for (i = ffs(b) - 1; i >= 0; i = ffs(b >> (i + 1) << (i + 1)) - 1) ++ ++#define mptcp_for_each_bit_unset(b, i) \ ++ mptcp_for_each_bit_set(~b, i) ++ ++#define MPTCP_INC_STATS(net, field) SNMP_INC_STATS((net)->mptcp.mptcp_statistics, field) ++#define MPTCP_DEC_STATS(net, field) SNMP_DEC_STATS((net)->mptcp.mptcp_statistics, field) ++ ++enum ++{ ++ MPTCP_MIB_NUM = 0, ++ MPTCP_MIB_MPCAPABLEPASSIVE, /* Received SYN with MP_CAPABLE */ ++ MPTCP_MIB_MPCAPABLEACTIVE, /* Sent SYN with MP_CAPABLE */ ++ MPTCP_MIB_MPCAPABLEACTIVEACK, /* Received SYN/ACK with MP_CAPABLE */ ++ MPTCP_MIB_MPCAPABLEPASSIVEACK, /* Received third ACK with MP_CAPABLE */ ++ MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK,/* Server-side fallback during 3-way handshake */ ++ MPTCP_MIB_MPCAPABLEACTIVEFALLBACK, /* Client-side fallback during 3-way handshake */ ++ MPTCP_MIB_MPCAPABLERETRANSFALLBACK,/* Client-side stopped sending MP_CAPABLE after too many SYN-retransmissions */ ++ MPTCP_MIB_CSUMENABLED, /* Created MPTCP-connection with DSS-checksum enabled */ ++ MPTCP_MIB_RETRANSSEGS, /* Segments retransmitted at the MPTCP-level */ ++ MPTCP_MIB_MPFAILRX, /* Received an MP_FAIL */ ++ MPTCP_MIB_CSUMFAIL, /* Received segment with invalid checksum */ ++ MPTCP_MIB_FASTCLOSERX, /* Recevied a FAST_CLOSE */ ++ MPTCP_MIB_FASTCLOSETX, /* Sent a FAST_CLOSE */ ++ MPTCP_MIB_FBACKSUB, /* Fallback upon ack without data-ack on new subflow */ ++ MPTCP_MIB_FBACKINIT, /* Fallback upon ack without data-ack on initial subflow */ ++ MPTCP_MIB_FBDATASUB, /* Fallback upon data without DSS at the beginning on new subflow */ ++ MPTCP_MIB_FBDATAINIT, /* Fallback upon data without DSS at the beginning on initial subflow */ ++ MPTCP_MIB_REMADDRSUB, /* Remove subflow due to REMOVE_ADDR */ ++ MPTCP_MIB_JOINNOTOKEN, /* Received MP_JOIN but the token was not found */ ++ MPTCP_MIB_JOINFALLBACK, /* Received MP_JOIN on session that has fallen back to reg. 
TCP */ ++ MPTCP_MIB_JOINSYNTX, /* Sent a SYN + MP_JOIN */ ++ MPTCP_MIB_JOINSYNRX, /* Received a SYN + MP_JOIN */ ++ MPTCP_MIB_JOINSYNACKRX, /* Received a SYN/ACK + MP_JOIN */ ++ MPTCP_MIB_JOINSYNACKMAC, /* HMAC was wrong on SYN/ACK + MP_JOIN */ ++ MPTCP_MIB_JOINACKRX, /* Received an ACK + MP_JOIN */ ++ MPTCP_MIB_JOINACKMAC, /* HMAC was wrong on ACK + MP_JOIN */ ++ MPTCP_MIB_JOINACKFAIL, /* Third ACK on new subflow did not contain an MP_JOIN */ ++ MPTCP_MIB_JOINACKRTO, /* Retransmission timer for third ACK + MP_JOIN timed out */ ++ MPTCP_MIB_JOINACKRXMIT, /* Retransmitted an ACK + MP_JOIN */ ++ MPTCP_MIB_NODSSWINDOW, /* Received too many packets without a DSS-option */ ++ MPTCP_MIB_DSSNOMATCH, /* Received a new mapping that did not match the previous one */ ++ MPTCP_MIB_INFINITEMAPRX, /* Received an infinite mapping */ ++ MPTCP_MIB_DSSTCPMISMATCH, /* DSS-mapping did not map with TCP's sequence numbers */ ++ MPTCP_MIB_DSSTRIMHEAD, /* Trimmed segment at the head (coalescing middlebox) */ ++ MPTCP_MIB_DSSSPLITTAIL, /* Trimmed segment at the tail (coalescing middlebox) */ ++ MPTCP_MIB_PURGEOLD, /* Removed old skb from the rcv-queue due to missing DSS-mapping */ ++ MPTCP_MIB_ADDADDRRX, /* Received an ADD_ADDR */ ++ MPTCP_MIB_ADDADDRTX, /* Sent an ADD_ADDR */ ++ MPTCP_MIB_REMADDRRX, /* Received a REMOVE_ADDR */ ++ MPTCP_MIB_REMADDRTX, /* Sent a REMOVE_ADDR */ ++ MPTCP_MIB_JOINALTERNATEPORT, /* Established a subflow on a different destination port-number */ ++ MPTCP_MIB_CURRESTAB, /* Current established MPTCP connections */ ++ __MPTCP_MIB_MAX ++}; ++ ++#define MPTCP_MIB_MAX __MPTCP_MIB_MAX ++struct mptcp_mib { ++ unsigned long mibs[MPTCP_MIB_MAX]; ++}; ++ ++extern struct lock_class_key meta_key; ++extern char *meta_key_name; ++extern struct lock_class_key meta_slock_key; ++extern char *meta_slock_key_name; ++ ++extern siphash_key_t mptcp_secret; ++ ++/* This is needed to ensure that two subsequent key/nonce-generation result in ++ * different keys/nonces if the IPs and ports are the same. ++ */ ++extern u32 mptcp_seed; ++ ++extern struct mptcp_hashtable mptcp_tk_htable; ++ ++/* Request-sockets can be hashed in the tk_htb for collision-detection or in ++ * the regular htb for join-connections. We need to define different NULLS ++ * values so that we can correctly detect a request-socket that has been ++ * recycled. See also c25eb3bfb9729. 
++ */ ++#define MPTCP_REQSK_NULLS_BASE (1U << 29) ++ ++ ++void mptcp_data_ready(struct sock *sk); ++void mptcp_write_space(struct sock *sk); ++ ++void mptcp_add_meta_ofo_queue(const struct sock *meta_sk, struct sk_buff *skb, ++ struct sock *sk); ++void mptcp_cleanup_rbuf(struct sock *meta_sk, int copied); ++int mptcp_add_sock(struct sock *meta_sk, struct sock *sk, u8 loc_id, u8 rem_id, ++ gfp_t flags); ++void mptcp_del_sock(struct sock *sk); ++void mptcp_update_metasocket(const struct sock *meta_sk); ++void mptcp_reinject_data(struct sock *orig_sk, int clone_it); ++void mptcp_update_sndbuf(const struct tcp_sock *tp); ++void mptcp_send_fin(struct sock *meta_sk); ++void mptcp_send_active_reset(struct sock *meta_sk, gfp_t priority); ++bool mptcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, ++ int push_one, gfp_t gfp); ++void tcp_parse_mptcp_options(const struct sk_buff *skb, ++ struct mptcp_options_received *mopt); ++void mptcp_parse_options(const uint8_t *ptr, int opsize, ++ struct mptcp_options_received *mopt, ++ const struct sk_buff *skb, ++ struct tcp_sock *tp); ++void mptcp_syn_options(const struct sock *sk, struct tcp_out_options *opts, ++ unsigned *remaining); ++void mptcp_synack_options(struct request_sock *req, ++ struct tcp_out_options *opts, ++ unsigned *remaining); ++void mptcp_established_options(struct sock *sk, struct sk_buff *skb, ++ struct tcp_out_options *opts, unsigned *size); ++void mptcp_options_write(__be32 *ptr, struct tcp_sock *tp, ++ const struct tcp_out_options *opts, ++ struct sk_buff *skb); ++void mptcp_close(struct sock *meta_sk, long timeout); ++bool mptcp_doit(struct sock *sk); ++int mptcp_create_master_sk(struct sock *meta_sk, __u64 remote_key, ++ int rem_key_set, __u8 mptcp_ver, u32 window); ++int mptcp_check_req_fastopen(struct sock *child, struct request_sock *req); ++int mptcp_check_req_master(struct sock *sk, struct sock *child, ++ struct request_sock *req, const struct sk_buff *skb, ++ const struct mptcp_options_received *mopt, ++ int drop, u32 tsoff); ++struct sock *mptcp_check_req_child(struct sock *meta_sk, ++ struct sock *child, ++ struct request_sock *req, ++ struct sk_buff *skb, ++ const struct mptcp_options_received *mopt); ++u32 __mptcp_select_window(struct sock *sk); ++void mptcp_select_initial_window(const struct sock *sk, int __space, __u32 mss, ++ __u32 *rcv_wnd, __u32 *window_clamp, ++ int wscale_ok, __u8 *rcv_wscale, ++ __u32 init_rcv_wnd); ++unsigned int mptcp_current_mss(struct sock *meta_sk); ++void mptcp_hmac(u8 ver, const u8 *key_1, const u8 *key_2, u8 *hash_out, ++ int arg_num, ...); ++void mptcp_clean_rtx_infinite(const struct sk_buff *skb, struct sock *sk); ++void mptcp_fin(struct sock *meta_sk); ++void mptcp_meta_retransmit_timer(struct sock *meta_sk); ++void mptcp_sub_retransmit_timer(struct sock *sk); ++int mptcp_write_wakeup(struct sock *meta_sk, int mib); ++void mptcp_sub_close_wq(struct work_struct *work); ++void mptcp_sub_close(struct sock *sk, unsigned long delay); ++struct sock *mptcp_select_ack_sock(const struct sock *meta_sk); ++void mptcp_prepare_for_backlog(struct sock *sk, struct sk_buff *skb); ++void mptcp_initialize_recv_vars(struct tcp_sock *meta_tp, struct mptcp_cb *mpcb, ++ __u64 remote_key); ++int mptcp_backlog_rcv(struct sock *meta_sk, struct sk_buff *skb); ++void mptcp_ack_handler(struct timer_list *t); ++bool mptcp_check_rtt(const struct tcp_sock *tp, int time); ++int mptcp_check_snd_buf(const struct tcp_sock *tp); ++bool mptcp_handle_options(struct sock *sk, const struct tcphdr *th, ++ const 
struct sk_buff *skb); ++void __init mptcp_init(void); ++void mptcp_destroy_sock(struct sock *sk); ++int mptcp_rcv_synsent_state_process(struct sock *sk, struct sock **skptr, ++ const struct sk_buff *skb, ++ const struct mptcp_options_received *mopt); ++unsigned int mptcp_xmit_size_goal(const struct sock *meta_sk, u32 mss_now, ++ int large_allowed); ++int mptcp_init_tw_sock(struct sock *sk, struct tcp_timewait_sock *tw); ++void mptcp_twsk_destructor(struct tcp_timewait_sock *tw); ++void mptcp_time_wait(struct sock *sk, int state, int timeo); ++void mptcp_disconnect(struct sock *meta_sk); ++bool mptcp_should_expand_sndbuf(const struct sock *sk); ++int mptcp_retransmit_skb(struct sock *meta_sk, struct sk_buff *skb); ++void mptcp_tsq_flags(struct sock *sk); ++void mptcp_tsq_sub_deferred(struct sock *meta_sk); ++struct mp_join *mptcp_find_join(const struct sk_buff *skb); ++void mptcp_hash_remove_bh(struct tcp_sock *meta_tp); ++struct sock *mptcp_hash_find(const struct net *net, const u32 token); ++int mptcp_lookup_join(struct sk_buff *skb, struct inet_timewait_sock *tw); ++int mptcp_do_join_short(struct sk_buff *skb, ++ const struct mptcp_options_received *mopt, ++ struct net *net); ++void mptcp_reqsk_destructor(struct request_sock *req); ++void mptcp_connect_init(struct sock *sk); ++void mptcp_sub_force_close(struct sock *sk); ++int mptcp_sub_len_remove_addr_align(u16 bitfield); ++void mptcp_join_reqsk_init(const struct mptcp_cb *mpcb, ++ const struct request_sock *req, ++ struct sk_buff *skb); ++void mptcp_reqsk_init(struct request_sock *req, const struct sock *sk, ++ const struct sk_buff *skb, bool want_cookie); ++int mptcp_conn_request(struct sock *sk, struct sk_buff *skb); ++void mptcp_enable_sock(struct sock *sk); ++void mptcp_disable_sock(struct sock *sk); ++void mptcp_disable_static_key(void); ++void mptcp_cookies_reqsk_init(struct request_sock *req, ++ struct mptcp_options_received *mopt, ++ struct sk_buff *skb); ++void mptcp_mpcb_put(struct mptcp_cb *mpcb); ++int mptcp_finish_handshake(struct sock *child, struct sk_buff *skb); ++int mptcp_get_info(const struct sock *meta_sk, char __user *optval, int optlen); ++void mptcp_clear_sk(struct sock *sk, int size); ++ ++/* MPTCP-path-manager registration/initialization functions */ ++int mptcp_register_path_manager(struct mptcp_pm_ops *pm); ++void mptcp_unregister_path_manager(struct mptcp_pm_ops *pm); ++void mptcp_init_path_manager(struct mptcp_cb *mpcb); ++void mptcp_cleanup_path_manager(struct mptcp_cb *mpcb); ++void mptcp_fallback_default(struct mptcp_cb *mpcb); ++void mptcp_get_default_path_manager(char *name); ++int mptcp_set_scheduler(struct sock *sk, const char *name); ++int mptcp_set_path_manager(struct sock *sk, const char *name); ++int mptcp_set_default_path_manager(const char *name); ++extern struct mptcp_pm_ops mptcp_pm_default; ++ ++/* MPTCP-scheduler registration/initialization functions */ ++int mptcp_register_scheduler(struct mptcp_sched_ops *sched); ++void mptcp_unregister_scheduler(struct mptcp_sched_ops *sched); ++void mptcp_init_scheduler(struct mptcp_cb *mpcb); ++void mptcp_cleanup_scheduler(struct mptcp_cb *mpcb); ++void mptcp_get_default_scheduler(char *name); ++int mptcp_set_default_scheduler(const char *name); ++bool mptcp_is_available(struct sock *sk, const struct sk_buff *skb, ++ bool zero_wnd_test); ++bool mptcp_is_def_unavailable(struct sock *sk); ++bool subflow_is_active(const struct tcp_sock *tp); ++bool subflow_is_backup(const struct tcp_sock *tp); ++struct sock *get_available_subflow(struct sock *meta_sk, 
struct sk_buff *skb, ++ bool zero_wnd_test); ++struct sk_buff *mptcp_next_segment(struct sock *meta_sk, ++ int *reinject, ++ struct sock **subsk, ++ unsigned int *limit); ++extern struct mptcp_sched_ops mptcp_sched_default; ++ ++/* Initializes function-pointers and MPTCP-flags */ ++static inline void mptcp_init_tcp_sock(struct sock *sk) ++{ ++ if (!mptcp_init_failed && sysctl_mptcp_enabled == MPTCP_ENABLE) ++ mptcp_enable_sock(sk); ++} ++ ++static inline void mptcp_init_listen(struct sock *sk) ++{ ++ if (!mptcp_init_failed && ++ sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP && ++#ifdef CONFIG_TCP_MD5SIG ++ !rcu_access_pointer(tcp_sk(sk)->md5sig_info) && ++#endif ++ sysctl_mptcp_enabled & MPTCP_ENABLE && ++ !(sysctl_mptcp_enabled & MPTCP_SERVER_DISABLE)) ++ mptcp_enable_sock(sk); ++} ++ ++static inline void mptcp_init_connect(struct sock *sk) ++{ ++ if (!mptcp_init_failed && ++ sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP && ++#ifdef CONFIG_TCP_MD5SIG ++ !rcu_access_pointer(tcp_sk(sk)->md5sig_info) && ++#endif ++ sysctl_mptcp_enabled & MPTCP_ENABLE && ++ !(sysctl_mptcp_enabled & MPTCP_CLIENT_DISABLE)) ++ mptcp_enable_sock(sk); ++} ++ ++static inline int mptcp_pi_to_flag(int pi) ++{ ++ return 1 << (pi - 1); ++} ++ ++static inline ++struct mptcp_request_sock *mptcp_rsk(const struct request_sock *req) ++{ ++ return (struct mptcp_request_sock *)req; ++} ++ ++static inline ++struct request_sock *rev_mptcp_rsk(const struct mptcp_request_sock *req) ++{ ++ return (struct request_sock *)req; ++} ++ ++static inline bool mptcp_can_sendpage(struct sock *sk) ++{ ++ struct mptcp_tcp_sock *mptcp; ++ ++ if (tcp_sk(sk)->mpcb->dss_csum) ++ return false; ++ ++ mptcp_for_each_sub(tcp_sk(sk)->mpcb, mptcp) { ++ struct sock *sk_it = mptcp_to_sock(mptcp); ++ ++ if (!(sk_it->sk_route_caps & NETIF_F_SG)) ++ return false; ++ } ++ ++ return true; ++} ++ ++static inline void mptcp_push_pending_frames(struct sock *meta_sk) ++{ ++ /* We check packets out and send-head here. TCP only checks the ++ * send-head. But, MPTCP also checks packets_out, as this is an ++ * indication that we might want to do opportunistic reinjection. ++ */ ++ if (tcp_sk(meta_sk)->packets_out || tcp_send_head(meta_sk)) { ++ struct tcp_sock *tp = tcp_sk(meta_sk); ++ ++ /* We don't care about the MSS, because it will be set in ++ * mptcp_write_xmit. ++ */ ++ __tcp_push_pending_frames(meta_sk, 0, tp->nonagle); ++ } ++} ++ ++static inline void mptcp_send_reset(struct sock *sk) ++{ ++ if (tcp_need_reset(sk->sk_state)) ++ tcp_sk(sk)->ops->send_active_reset(sk, GFP_ATOMIC); ++ mptcp_sub_force_close(sk); ++} ++ ++static inline void mptcp_sub_force_close_all(struct mptcp_cb *mpcb, ++ struct sock *except) ++{ ++ struct mptcp_tcp_sock *mptcp; ++ struct hlist_node *tmp; ++ ++ mptcp_for_each_sub_safe(mpcb, mptcp, tmp) { ++ struct sock *sk_it = mptcp_to_sock(mptcp); ++ ++ if (sk_it != except) ++ mptcp_send_reset(sk_it); ++ } ++} ++ ++static inline bool mptcp_is_data_mpcapable(const struct sk_buff *skb) ++{ ++ return TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_MPC_DATA; ++} ++ ++static inline bool mptcp_is_data_seq(const struct sk_buff *skb) ++{ ++ return TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_SEQ; ++} ++ ++static inline bool mptcp_is_data_fin(const struct sk_buff *skb) ++{ ++ return TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_FIN; ++} ++ ++/* Is it a data-fin while in infinite mapping mode? ++ * In infinite mode, a subflow-fin is in fact a data-fin. 
++ */ ++static inline bool mptcp_is_data_fin2(const struct sk_buff *skb, ++ const struct tcp_sock *tp) ++{ ++ return mptcp_is_data_fin(skb) || ++ (tp->mpcb->infinite_mapping_rcv && ++ (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)); ++} ++ ++static inline u8 mptcp_get_64_bit(u64 data_seq, struct mptcp_cb *mpcb) ++{ ++ u64 data_seq_high = (u32)(data_seq >> 32); ++ ++ if (mpcb->rcv_high_order[0] == data_seq_high) ++ return 0; ++ else if (mpcb->rcv_high_order[1] == data_seq_high) ++ return MPTCPHDR_SEQ64_INDEX; ++ else ++ return MPTCPHDR_SEQ64_OFO; ++} ++ ++/* Sets the data_seq and returns pointer to the in-skb field of the data_seq. ++ * If the packet has a 64-bit dseq, the pointer points to the last 32 bits. ++ */ ++static inline __u32 *mptcp_skb_set_data_seq(const struct sk_buff *skb, ++ u32 *data_seq, ++ struct mptcp_cb *mpcb) ++{ ++ __u32 *ptr = (__u32 *)(skb_transport_header(skb) + TCP_SKB_CB(skb)->dss_off); ++ ++ if (TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_SEQ64_SET) { ++ u64 data_seq64 = get_unaligned_be64(ptr); ++ ++ if (mpcb) ++ TCP_SKB_CB(skb)->mptcp_flags |= mptcp_get_64_bit(data_seq64, mpcb); ++ ++ *data_seq = (u32)data_seq64; ++ ptr++; ++ } else { ++ *data_seq = get_unaligned_be32(ptr); ++ } ++ ++ return ptr; ++} ++ ++static inline struct sock *mptcp_meta_sk(const struct sock *sk) ++{ ++ return tcp_sk(sk)->meta_sk; ++} ++ ++static inline struct tcp_sock *mptcp_meta_tp(const struct tcp_sock *tp) ++{ ++ return tcp_sk(tp->meta_sk); ++} ++ ++static inline int is_meta_tp(const struct tcp_sock *tp) ++{ ++ return tp->mpcb && mptcp_meta_tp(tp) == tp; ++} ++ ++static inline int is_meta_sk(const struct sock *sk) ++{ ++ return sk->sk_state != TCP_NEW_SYN_RECV && ++ sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP && ++ mptcp(tcp_sk(sk)) && mptcp_meta_sk(sk) == sk; ++} ++ ++static inline int is_master_tp(const struct tcp_sock *tp) ++{ ++ return !mptcp(tp) || (!tp->mptcp->slave_sk && !is_meta_tp(tp)); ++} ++ ++static inline void mptcp_init_mp_opt(struct mptcp_options_received *mopt) ++{ ++ mopt->saw_mpc = 0; ++ mopt->dss_csum = 0; ++ mopt->drop_me = 0; ++ ++ mopt->is_mp_join = 0; ++ mopt->join_ack = 0; ++ ++ mopt->saw_low_prio = 0; ++ mopt->low_prio = 0; ++ ++ mopt->saw_add_addr = 0; ++ mopt->more_add_addr = 0; ++ ++ mopt->saw_rem_addr = 0; ++ mopt->more_rem_addr = 0; ++ ++ mopt->mp_fail = 0; ++ mopt->mp_fclose = 0; ++} ++ ++static inline void mptcp_reset_mopt(struct tcp_sock *tp) ++{ ++ struct mptcp_options_received *mopt = &tp->mptcp->rx_opt; ++ ++ mopt->saw_low_prio = 0; ++ mopt->saw_add_addr = 0; ++ mopt->more_add_addr = 0; ++ mopt->saw_rem_addr = 0; ++ mopt->more_rem_addr = 0; ++ mopt->join_ack = 0; ++ mopt->mp_fail = 0; ++ mopt->mp_fclose = 0; ++} ++ ++static inline __be32 mptcp_get_highorder_sndbits(const struct sk_buff *skb, ++ const struct mptcp_cb *mpcb) ++{ ++ return htonl(mpcb->snd_high_order[(TCP_SKB_CB(skb)->mptcp_flags & ++ MPTCPHDR_SEQ64_INDEX) ? 
1 : 0]); ++} ++ ++static inline u64 mptcp_get_data_seq_64(const struct mptcp_cb *mpcb, int index, ++ u32 data_seq_32) ++{ ++ return ((u64)mpcb->rcv_high_order[index] << 32) | data_seq_32; ++} ++ ++static inline u64 mptcp_get_rcv_nxt_64(const struct tcp_sock *meta_tp) ++{ ++ struct mptcp_cb *mpcb = meta_tp->mpcb; ++ return mptcp_get_data_seq_64(mpcb, mpcb->rcv_hiseq_index, ++ meta_tp->rcv_nxt); ++} ++ ++static inline void mptcp_check_sndseq_wrap(struct tcp_sock *meta_tp, int inc) ++{ ++ if (unlikely(meta_tp->snd_nxt > meta_tp->snd_nxt + inc)) { ++ struct mptcp_cb *mpcb = meta_tp->mpcb; ++ mpcb->snd_hiseq_index = mpcb->snd_hiseq_index ? 0 : 1; ++ mpcb->snd_high_order[mpcb->snd_hiseq_index] += 2; ++ } ++} ++ ++static inline void mptcp_check_rcvseq_wrap(struct tcp_sock *meta_tp, ++ u32 old_rcv_nxt) ++{ ++ if (unlikely(old_rcv_nxt > meta_tp->rcv_nxt)) { ++ struct mptcp_cb *mpcb = meta_tp->mpcb; ++ mpcb->rcv_high_order[mpcb->rcv_hiseq_index] += 2; ++ mpcb->rcv_hiseq_index = mpcb->rcv_hiseq_index ? 0 : 1; ++ } ++} ++ ++static inline int mptcp_sk_can_send(const struct sock *sk) ++{ ++ return tcp_passive_fastopen(sk) || ++ ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) && ++ !tcp_sk(sk)->mptcp->pre_established); ++} ++ ++static inline int mptcp_sk_can_recv(const struct sock *sk) ++{ ++ return (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2); ++} ++ ++static inline int mptcp_sk_can_send_ack(const struct sock *sk) ++{ ++ return !((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV | ++ TCPF_CLOSE | TCPF_LISTEN)) && ++ !tcp_sk(sk)->mptcp->pre_established; ++} ++ ++static inline bool mptcp_can_sg(const struct sock *meta_sk) ++{ ++ struct mptcp_tcp_sock *mptcp; ++ ++ if (tcp_sk(meta_sk)->mpcb->dss_csum) ++ return false; ++ ++ mptcp_for_each_sub(tcp_sk(meta_sk)->mpcb, mptcp) { ++ struct sock *sk = mptcp_to_sock(mptcp); ++ ++ if (!mptcp_sk_can_send(sk)) ++ continue; ++ if (!(sk->sk_route_caps & NETIF_F_SG)) ++ return false; ++ } ++ return true; ++} ++ ++static inline void mptcp_set_rto(struct sock *sk) ++{ ++ struct inet_connection_sock *micsk = inet_csk(mptcp_meta_sk(sk)); ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct mptcp_tcp_sock *mptcp; ++ __u32 max_rto = 0; ++ ++ /* We are in recovery-phase on the MPTCP-level. Do not update the ++ * RTO, because this would kill exponential backoff. ++ */ ++ if (micsk->icsk_retransmits) ++ return; ++ ++ mptcp_for_each_sub(tp->mpcb, mptcp) { ++ struct sock *sk_it = mptcp_to_sock(mptcp); ++ ++ if ((mptcp_sk_can_send(sk_it) || sk_it->sk_state == TCP_SYN_RECV) && ++ inet_csk(sk_it)->icsk_retransmits == 0 && ++ inet_csk(sk_it)->icsk_backoff == 0 && ++ inet_csk(sk_it)->icsk_rto > max_rto) ++ max_rto = inet_csk(sk_it)->icsk_rto; ++ } ++ if (max_rto) { ++ micsk->icsk_rto = max_rto << 1; ++ ++ /* A successfull rto-measurement - reset backoff counter */ ++ micsk->icsk_backoff = 0; ++ } ++} ++ ++static inline void mptcp_sub_close_passive(struct sock *sk) ++{ ++ struct sock *meta_sk = mptcp_meta_sk(sk); ++ struct tcp_sock *tp = tcp_sk(sk), *meta_tp = tcp_sk(meta_sk); ++ ++ /* Only close, if the app did a send-shutdown (passive close), and we ++ * received the data-ack of the data-fin. 
++ */ ++ if (tp->mpcb->passive_close && meta_tp->snd_una == meta_tp->write_seq) ++ mptcp_sub_close(sk, 0); ++} ++ ++static inline void mptcp_fallback_close(struct mptcp_cb *mpcb, ++ struct sock *except) ++{ ++ mptcp_sub_force_close_all(mpcb, except); ++ ++ if (mpcb->pm_ops->close_session) ++ mpcb->pm_ops->close_session(mptcp_meta_sk(except)); ++} ++ ++static inline bool mptcp_fallback_infinite(struct sock *sk, int flag) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct mptcp_cb *mpcb = tp->mpcb; ++ ++ /* If data has been acknowleged on the meta-level, fully_established ++ * will have been set before and thus we will not fall back to infinite ++ * mapping. ++ */ ++ if (likely(tp->mptcp->fully_established)) ++ return false; ++ ++ if (!(flag & MPTCP_FLAG_DATA_ACKED)) ++ return false; ++ ++ /* Don't fallback twice ;) */ ++ if (mpcb->infinite_mapping_snd) ++ return false; ++ ++ pr_debug("%s %#x will fallback - pi %d, src %pI4:%u dst %pI4:%u rcv_nxt %u from %pS\n", ++ __func__, mpcb->mptcp_loc_token, tp->mptcp->path_index, ++ &inet_sk(sk)->inet_saddr, ntohs(inet_sk(sk)->inet_sport), ++ &inet_sk(sk)->inet_daddr, ntohs(inet_sk(sk)->inet_dport), ++ tp->rcv_nxt, __builtin_return_address(0)); ++ if (!is_master_tp(tp)) { ++ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_FBACKSUB); ++ return true; ++ } ++ ++ mpcb->infinite_mapping_snd = 1; ++ mpcb->infinite_mapping_rcv = 1; ++ mpcb->infinite_rcv_seq = mptcp_get_rcv_nxt_64(mptcp_meta_tp(tp)); ++ tp->mptcp->fully_established = 1; ++ ++ mptcp_fallback_close(mpcb, sk); ++ ++ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_FBACKINIT); ++ ++ return false; ++} ++ ++static inline bool mptcp_v6_is_v4_mapped(const struct sock *sk) ++{ ++ return sk->sk_family == AF_INET6 && ++ ipv6_addr_type(&inet6_sk(sk)->saddr) == IPV6_ADDR_MAPPED; ++} ++ ++/* We are in or are becoming to be in infinite mapping mode */ ++static inline bool mptcp_in_infinite_mapping_weak(const struct mptcp_cb *mpcb) ++{ ++ return mpcb->infinite_mapping_rcv || ++ mpcb->infinite_mapping_snd || ++ mpcb->send_infinite_mapping; ++} ++ ++static inline bool mptcp_can_new_subflow(const struct sock *meta_sk) ++{ ++ /* Has been removed from the tk-table. Thus, no new subflows. ++ * ++ * Check for close-state is necessary, because we may have been closed ++ * without passing by mptcp_close(). ++ * ++ * When falling back, no new subflows are allowed either. ++ */ ++ return meta_sk->sk_state != TCP_CLOSE && ++ tcp_sk(meta_sk)->inside_tk_table && ++ !tcp_sk(meta_sk)->mpcb->infinite_mapping_rcv && ++ !tcp_sk(meta_sk)->mpcb->send_infinite_mapping; ++} ++ ++static inline int mptcp_subflow_count(const struct mptcp_cb *mpcb) ++{ ++ struct mptcp_tcp_sock *mptcp; ++ int i = 0; ++ ++ mptcp_for_each_sub(mpcb, mptcp) ++ i++; ++ ++ return i; ++} ++ ++/* TCP and MPTCP mpc flag-depending functions */ ++u16 mptcp_select_window(struct sock *sk); ++void mptcp_tcp_set_rto(struct sock *sk); ++ ++#else /* CONFIG_MPTCP */ ++#define mptcp_debug(fmt, args...) 
\ ++ do { \ ++ } while (0) ++ ++static inline struct sock *mptcp_to_sock(const struct mptcp_tcp_sock *mptcp) ++{ ++ return NULL; ++} ++ ++#define mptcp_for_each_sub(__mpcb, __mptcp) \ ++ if (0) ++ ++#define MPTCP_INC_STATS(net, field) \ ++ do { \ ++ } while(0) ++ ++#define MPTCP_DEC_STATS(net, field) \ ++ do { \ ++ } while(0) ++ ++static inline bool mptcp_is_data_fin(const struct sk_buff *skb) ++{ ++ return false; ++} ++static inline bool mptcp_is_data_seq(const struct sk_buff *skb) ++{ ++ return false; ++} ++static inline struct sock *mptcp_meta_sk(const struct sock *sk) ++{ ++ return NULL; ++} ++static inline struct tcp_sock *mptcp_meta_tp(const struct tcp_sock *tp) ++{ ++ return NULL; ++} ++static inline int is_meta_sk(const struct sock *sk) ++{ ++ return 0; ++} ++static inline int is_master_tp(const struct tcp_sock *tp) ++{ ++ return 0; ++} ++static inline void mptcp_del_sock(const struct sock *sk) {} ++static inline void mptcp_update_metasocket(const struct sock *meta_sk) {} ++static inline void mptcp_reinject_data(struct sock *orig_sk, int clone_it) {} ++static inline void mptcp_update_sndbuf(const struct tcp_sock *tp) {} ++static inline void mptcp_clean_rtx_infinite(const struct sk_buff *skb, ++ const struct sock *sk) {} ++static inline void mptcp_sub_close(struct sock *sk, unsigned long delay) {} ++static inline void mptcp_set_rto(const struct sock *sk) {} ++static inline void mptcp_send_fin(const struct sock *meta_sk) {} ++static inline void mptcp_parse_options(const uint8_t *ptr, const int opsize, ++ struct mptcp_options_received *mopt, ++ const struct sk_buff *skb, ++ const struct tcp_sock *tp) {} ++static inline void mptcp_syn_options(const struct sock *sk, ++ struct tcp_out_options *opts, ++ unsigned *remaining) {} ++static inline void mptcp_synack_options(struct request_sock *req, ++ struct tcp_out_options *opts, ++ unsigned *remaining) {} ++ ++static inline void mptcp_established_options(struct sock *sk, ++ struct sk_buff *skb, ++ struct tcp_out_options *opts, ++ unsigned *size) {} ++static inline void mptcp_options_write(__be32 *ptr, struct tcp_sock *tp, ++ const struct tcp_out_options *opts, ++ struct sk_buff *skb) {} ++static inline void mptcp_close(struct sock *meta_sk, long timeout) {} ++static inline bool mptcp_doit(struct sock *sk) ++{ ++ return false; ++} ++static inline int mptcp_check_req_fastopen(struct sock *child, ++ struct request_sock *req) ++{ ++ return 1; ++} ++static inline int mptcp_check_req_master(const struct sock *sk, ++ const struct sock *child, ++ const struct request_sock *req, ++ const struct sk_buff *skb, ++ const struct mptcp_options_received *mopt, ++ int drop, ++ u32 tsoff) ++{ ++ return 1; ++} ++static inline struct sock *mptcp_check_req_child(const struct sock *meta_sk, ++ const struct sock *child, ++ const struct request_sock *req, ++ struct sk_buff *skb, ++ const struct mptcp_options_received *mopt) ++{ ++ return NULL; ++} ++static inline unsigned int mptcp_current_mss(struct sock *meta_sk) ++{ ++ return 0; ++} ++static inline void mptcp_sub_close_passive(struct sock *sk) {} ++static inline bool mptcp_fallback_infinite(const struct sock *sk, int flag) ++{ ++ return false; ++} ++static inline void mptcp_init_mp_opt(const struct mptcp_options_received *mopt) {} ++static inline void mptcp_prepare_for_backlog(struct sock *sk, struct sk_buff *skb) {} ++static inline bool mptcp_check_rtt(const struct tcp_sock *tp, int time) ++{ ++ return false; ++} ++static inline int mptcp_check_snd_buf(const struct tcp_sock *tp) ++{ ++ return 0; ++} ++static 
inline void mptcp_push_pending_frames(struct sock *meta_sk) {} ++static inline void mptcp_send_reset(const struct sock *sk) {} ++static inline void mptcp_sub_force_close_all(struct mptcp_cb *mpcb, ++ struct sock *except) {} ++static inline bool mptcp_handle_options(struct sock *sk, ++ const struct tcphdr *th, ++ struct sk_buff *skb) ++{ ++ return false; ++} ++static inline void mptcp_reset_mopt(struct tcp_sock *tp) {} ++static inline void __init mptcp_init(void) {} ++static inline bool mptcp_can_sg(const struct sock *meta_sk) ++{ ++ return false; ++} ++static inline unsigned int mptcp_xmit_size_goal(const struct sock *meta_sk, ++ u32 mss_now, int large_allowed) ++{ ++ return 0; ++} ++static inline void mptcp_destroy_sock(struct sock *sk) {} ++static inline int mptcp_rcv_synsent_state_process(struct sock *sk, ++ struct sock **skptr, ++ struct sk_buff *skb, ++ const struct mptcp_options_received *mopt) ++{ ++ return 0; ++} ++static inline bool mptcp_can_sendpage(struct sock *sk) ++{ ++ return false; ++} ++static inline int mptcp_init_tw_sock(struct sock *sk, ++ struct tcp_timewait_sock *tw) ++{ ++ return 0; ++} ++static inline void mptcp_twsk_destructor(struct tcp_timewait_sock *tw) {} ++static inline void mptcp_disconnect(struct sock *meta_sk) {} ++static inline void mptcp_tsq_flags(struct sock *sk) {} ++static inline void mptcp_tsq_sub_deferred(struct sock *meta_sk) {} ++static inline void mptcp_hash_remove_bh(struct tcp_sock *meta_tp) {} ++static inline void mptcp_remove_shortcuts(const struct mptcp_cb *mpcb, ++ const struct sk_buff *skb) {} ++static inline void mptcp_init_tcp_sock(struct sock *sk) {} ++static inline void mptcp_init_listen(struct sock *sk) {} ++static inline void mptcp_init_connect(struct sock *sk) {} ++static inline void mptcp_disable_static_key(void) {} ++static inline void mptcp_cookies_reqsk_init(struct request_sock *req, ++ struct mptcp_options_received *mopt, ++ struct sk_buff *skb) {} ++static inline void mptcp_mpcb_put(struct mptcp_cb *mpcb) {} ++static inline void mptcp_fin(struct sock *meta_sk) {} ++static inline bool mptcp_in_infinite_mapping_weak(const struct mptcp_cb *mpcb) ++{ ++ return false; ++} ++static inline bool mptcp_can_new_subflow(const struct sock *meta_sk) ++{ ++ return false; ++} ++ ++#endif /* CONFIG_MPTCP */ ++ ++#endif /* _MPTCP_H */ +diff --git a/include/net/mptcp_v4.h b/include/net/mptcp_v4.h +new file mode 100644 +index 000000000000..c58d42b11f6a +--- /dev/null ++++ b/include/net/mptcp_v4.h +@@ -0,0 +1,76 @@ ++/* ++ * MPTCP implementation ++ * ++ * Initial Design & Implementation: ++ * SĂ©bastien BarrĂ© ++ * ++ * Current Maintainer & Author: ++ * Christoph Paasch ++ * ++ * Additional authors: ++ * Jaakko Korkeaniemi ++ * Gregory Detal ++ * Fabien DuchĂªne ++ * Andreas Seelinger ++ * Lavkesh Lahngir ++ * Andreas Ripke ++ * Vlad Dogaru ++ * Octavian Purdila ++ * John Ronan ++ * Catalin Nicutar ++ * Brandon Heller ++ * ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License ++ * as published by the Free Software Foundation; either version ++ * 2 of the License, or (at your option) any later version. 
++ */ ++ ++#ifndef MPTCP_V4_H_ ++#define MPTCP_V4_H_ ++ ++ ++#include ++#include ++#include ++#include ++#include ++ ++extern struct request_sock_ops mptcp_request_sock_ops; ++extern const struct inet_connection_sock_af_ops mptcp_v4_specific; ++extern struct tcp_request_sock_ops mptcp_request_sock_ipv4_ops; ++extern struct tcp_request_sock_ops mptcp_join_request_sock_ipv4_ops; ++ ++#ifdef CONFIG_MPTCP ++ ++int mptcp_v4_do_rcv(struct sock *meta_sk, struct sk_buff *skb); ++struct sock *mptcp_v4_search_req(const __be16 rport, const __be32 raddr, ++ const __be32 laddr, const struct net *net); ++int __mptcp_init4_subsockets(struct sock *meta_sk, const struct mptcp_loc4 *loc, ++ __be16 sport, struct mptcp_rem4 *rem, ++ struct sock **subsk); ++int mptcp_pm_v4_init(void); ++void mptcp_pm_v4_undo(void); ++u32 mptcp_v4_get_nonce(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport); ++u64 mptcp_v4_get_key(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport, ++ u32 seed); ++ ++static inline int mptcp_init4_subsockets(struct sock *meta_sk, ++ const struct mptcp_loc4 *loc, ++ struct mptcp_rem4 *rem) ++{ ++ return __mptcp_init4_subsockets(meta_sk, loc, 0, rem, NULL); ++} ++ ++#else ++ ++static inline int mptcp_v4_do_rcv(const struct sock *meta_sk, ++ const struct sk_buff *skb) ++{ ++ return 0; ++} ++ ++#endif /* CONFIG_MPTCP */ ++ ++#endif /* MPTCP_V4_H_ */ +diff --git a/include/net/mptcp_v6.h b/include/net/mptcp_v6.h +new file mode 100644 +index 000000000000..93e8c87c2eb1 +--- /dev/null ++++ b/include/net/mptcp_v6.h +@@ -0,0 +1,77 @@ ++/* ++ * MPTCP implementation ++ * ++ * Initial Design & Implementation: ++ * SĂ©bastien BarrĂ© ++ * ++ * Current Maintainer & Author: ++ * Jaakko Korkeaniemi ++ * ++ * Additional authors: ++ * Jaakko Korkeaniemi ++ * Gregory Detal ++ * Fabien DuchĂªne ++ * Andreas Seelinger ++ * Lavkesh Lahngir ++ * Andreas Ripke ++ * Vlad Dogaru ++ * Octavian Purdila ++ * John Ronan ++ * Catalin Nicutar ++ * Brandon Heller ++ * ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License ++ * as published by the Free Software Foundation; either version ++ * 2 of the License, or (at your option) any later version. 
++ */ ++ ++#ifndef _MPTCP_V6_H ++#define _MPTCP_V6_H ++ ++#include ++#include ++ ++#include ++ ++ ++#ifdef CONFIG_MPTCP ++extern const struct inet_connection_sock_af_ops mptcp_v6_mapped; ++extern const struct inet_connection_sock_af_ops mptcp_v6_specific; ++extern struct request_sock_ops mptcp6_request_sock_ops; ++extern struct tcp_request_sock_ops mptcp_request_sock_ipv6_ops; ++extern struct tcp_request_sock_ops mptcp_join_request_sock_ipv6_ops; ++ ++int mptcp_v6_do_rcv(struct sock *meta_sk, struct sk_buff *skb); ++struct sock *mptcp_v6_search_req(const __be16 rport, const struct in6_addr *raddr, ++ const struct in6_addr *laddr, const struct net *net); ++int __mptcp_init6_subsockets(struct sock *meta_sk, const struct mptcp_loc6 *loc, ++ __be16 sport, struct mptcp_rem6 *rem, ++ struct sock **subsk); ++int mptcp_pm_v6_init(void); ++void mptcp_pm_v6_undo(void); ++__u32 mptcp_v6_get_nonce(const __be32 *saddr, const __be32 *daddr, ++ __be16 sport, __be16 dport); ++u64 mptcp_v6_get_key(const __be32 *saddr, const __be32 *daddr, ++ __be16 sport, __be16 dport, u32 seed); ++ ++static inline int mptcp_init6_subsockets(struct sock *meta_sk, ++ const struct mptcp_loc6 *loc, ++ struct mptcp_rem6 *rem) ++{ ++ return __mptcp_init6_subsockets(meta_sk, loc, 0, rem, NULL); ++} ++ ++#else /* CONFIG_MPTCP */ ++ ++#define mptcp_v6_mapped ipv6_mapped ++ ++static inline int mptcp_v6_do_rcv(struct sock *meta_sk, struct sk_buff *skb) ++{ ++ return 0; ++} ++ ++#endif /* CONFIG_MPTCP */ ++ ++#endif /* _MPTCP_V6_H */ +diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h +index 167e390ac9d4..7233acfcdb4d 100644 +--- a/include/net/net_namespace.h ++++ b/include/net/net_namespace.h +@@ -19,6 +19,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -123,6 +124,9 @@ struct net { + #if IS_ENABLED(CONFIG_IPV6) + struct netns_ipv6 ipv6; + #endif ++#if IS_ENABLED(CONFIG_MPTCP) ++ struct netns_mptcp mptcp; ++#endif + #if IS_ENABLED(CONFIG_IEEE802154_6LOWPAN) + struct netns_ieee802154_lowpan ieee802154_lowpan; + #endif +diff --git a/include/net/netns/mptcp.h b/include/net/netns/mptcp.h +new file mode 100644 +index 000000000000..6680f3bbcfc8 +--- /dev/null ++++ b/include/net/netns/mptcp.h +@@ -0,0 +1,52 @@ ++/* ++ * MPTCP implementation - MPTCP namespace ++ * ++ * Initial Design & Implementation: ++ * SĂ©bastien BarrĂ© ++ * ++ * Current Maintainer: ++ * Christoph Paasch ++ * ++ * Additional authors: ++ * Jaakko Korkeaniemi ++ * Gregory Detal ++ * Fabien DuchĂªne ++ * Andreas Seelinger ++ * Lavkesh Lahngir ++ * Andreas Ripke ++ * Vlad Dogaru ++ * Octavian Purdila ++ * John Ronan ++ * Catalin Nicutar ++ * Brandon Heller ++ * ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License ++ * as published by the Free Software Foundation; either version ++ * 2 of the License, or (at your option) any later version. 
++ */ ++ ++#ifndef __NETNS_MPTCP_H__ ++#define __NETNS_MPTCP_H__ ++ ++#include ++ ++enum { ++ MPTCP_PM_FULLMESH = 0, ++ MPTCP_PM_MAX ++}; ++ ++struct mptcp_mib; ++ ++struct netns_mptcp { ++ DEFINE_SNMP_STAT(struct mptcp_mib, mptcp_statistics); ++ ++#ifdef CONFIG_PROC_FS ++ struct proc_dir_entry *proc_net_mptcp; ++#endif ++ ++ void *path_managers[MPTCP_PM_MAX]; ++}; ++ ++#endif /* __NETNS_MPTCP_H__ */ +diff --git a/include/net/snmp.h b/include/net/snmp.h +index cb8ced4380a6..0aa0d10af2ce 100644 +--- a/include/net/snmp.h ++++ b/include/net/snmp.h +@@ -86,7 +86,6 @@ struct icmpv6msg_mib_device { + atomic_long_t mibs[ICMP6MSG_MIB_MAX]; + }; + +- + /* TCP */ + #define TCP_MIB_MAX __TCP_MIB_MAX + struct tcp_mib { +diff --git a/include/net/sock.h b/include/net/sock.h +index 079b5f6f13d8..8ae33ecd9d0a 100644 +--- a/include/net/sock.h ++++ b/include/net/sock.h +@@ -821,6 +821,7 @@ enum sock_flags { + SOCK_TXTIME, + SOCK_XDP, /* XDP is attached */ + SOCK_TSTAMP_NEW, /* Indicates 64 bit timestamps always */ ++ SOCK_MPTCP, /* MPTCP set on this socket */ + }; + + #define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)) +@@ -1133,6 +1134,7 @@ struct proto { + void (*unhash)(struct sock *sk); + void (*rehash)(struct sock *sk); + int (*get_port)(struct sock *sk, unsigned short snum); ++ void (*clear_sk)(struct sock *sk, int size); + + /* Keeping track of sockets in use */ + #ifdef CONFIG_PROC_FS +diff --git a/include/net/tcp.h b/include/net/tcp.h +index b914959cd2c6..b290be3e510c 100644 +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -182,6 +182,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); + #define TCPOPT_SACK 5 /* SACK Block */ + #define TCPOPT_TIMESTAMP 8 /* Better RTT estimations/PAWS */ + #define TCPOPT_MD5SIG 19 /* MD5 Signature (RFC2385) */ ++#define TCPOPT_MPTCP 30 + #define TCPOPT_FASTOPEN 34 /* Fast open (RFC7413) */ + #define TCPOPT_EXP 254 /* Experimental */ + /* Magic number to be after the option value for sharing TCP +@@ -238,6 +239,31 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); + */ + #define TFO_SERVER_WO_SOCKOPT1 0x400 + ++/* Flags from tcp_input.c for tcp_ack */ ++#define FLAG_DATA 0x01 /* Incoming frame contained data. */ ++#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ ++#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */ ++#define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted. */ ++#define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */ ++#define FLAG_DATA_SACKED 0x20 /* New SACK. 
*/ ++#define FLAG_ECE 0x40 /* ECE in this ACK */ ++#define FLAG_LOST_RETRANS 0x80 /* This ACK marks some retransmission lost */ ++#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/ ++#define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */ ++#define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */ ++#define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */ ++#define FLAG_SET_XMIT_TIMER 0x1000 /* Set TLP or RTO timer */ ++#define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */ ++#define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */ ++#define FLAG_NO_CHALLENGE_ACK 0x8000 /* do not call tcp_send_challenge_ack() */ ++#define FLAG_ACK_MAYBE_DELAYED 0x10000 /* Likely a delayed ACK */ ++ ++#define MPTCP_FLAG_DATA_ACKED 0x20000 ++ ++#define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED) ++#define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED) ++#define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE|FLAG_DSACKING_ACK) ++#define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED) + + /* sysctl variables for tcp */ + extern int sysctl_tcp_max_orphans; +@@ -310,6 +336,98 @@ extern struct proto tcp_prot; + #define TCP_DEC_STATS(net, field) SNMP_DEC_STATS((net)->mib.tcp_statistics, field) + #define TCP_ADD_STATS(net, field, val) SNMP_ADD_STATS((net)->mib.tcp_statistics, field, val) + ++/**** START - Exports needed for MPTCP ****/ ++extern const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops; ++extern const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops; ++ ++struct mptcp_options_received; ++ ++void tcp_cleanup_rbuf(struct sock *sk, int copied); ++int tcp_close_state(struct sock *sk); ++void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now, ++ const struct sk_buff *skb); ++int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib); ++void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb); ++int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, ++ gfp_t gfp_mask); ++u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now); ++unsigned int tcp_mss_split_point(const struct sock *sk, ++ const struct sk_buff *skb, ++ unsigned int mss_now, ++ unsigned int max_segs, ++ int nonagle); ++bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb, ++ unsigned int cur_mss, int nonagle); ++bool tcp_snd_wnd_test(const struct tcp_sock *tp, const struct sk_buff *skb, ++ unsigned int cur_mss); ++unsigned int tcp_cwnd_test(const struct tcp_sock *tp, const struct sk_buff *skb); ++int tcp_init_tso_segs(struct sk_buff *skb, unsigned int mss_now); ++int __pskb_trim_head(struct sk_buff *skb, int len); ++void tcp_queue_skb(struct sock *sk, struct sk_buff *skb); ++void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags); ++void tcp_reset(struct sock *sk); ++bool tcp_may_update_window(const struct tcp_sock *tp, const u32 ack, ++ const u32 ack_seq, const u32 nwin); ++bool tcp_urg_mode(const struct tcp_sock *tp); ++void tcp_ack_probe(struct sock *sk); ++void tcp_rearm_rto(struct sock *sk); ++int tcp_write_timeout(struct sock *sk); ++bool retransmits_timed_out(struct sock *sk, ++ unsigned int boundary, ++ unsigned int timeout); ++void tcp_write_err(struct sock *sk); ++void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr); ++void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb, ++ u64 prior_wstamp); ++void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now); ++ ++void tcp_v4_reqsk_send_ack(const 
struct sock *sk, struct sk_buff *skb, ++ struct request_sock *req); ++void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb); ++struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb); ++void tcp_v4_reqsk_destructor(struct request_sock *req); ++ ++void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, ++ struct request_sock *req); ++void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb); ++struct sock *tcp_v6_cookie_check(struct sock *sk, struct sk_buff *skb); ++int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb); ++int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); ++void tcp_v6_destroy_sock(struct sock *sk); ++void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb); ++void tcp_v6_hash(struct sock *sk); ++struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb); ++struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, ++ struct request_sock *req, ++ struct dst_entry *dst, ++ struct request_sock *req_unhash, ++ bool *own_req); ++void tcp_v6_reqsk_destructor(struct request_sock *req); ++ ++unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, ++ int large_allowed); ++u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb); ++void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, u32 prior_snd_una); ++ ++void skb_clone_fraglist(struct sk_buff *skb); ++ ++void inet_twsk_free(struct inet_timewait_sock *tw); ++int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb); ++/* These states need RST on ABORT according to RFC793 */ ++static inline bool tcp_need_reset(int state) ++{ ++ return (1 << state) & ++ (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 | ++ TCPF_FIN_WAIT2 | TCPF_SYN_RECV); ++} ++ ++int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, ++ bool *fragstolen); ++void tcp_ofo_queue(struct sock *sk); ++void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb); ++int linear_payload_sz(bool first_skb); ++/**** END - Exports needed for MPTCP ****/ ++ + void tcp_tasklet_init(void); + + int tcp_v4_err(struct sk_buff *skb, u32); +@@ -411,7 +529,9 @@ int tcp_mmap(struct file *file, struct socket *sock, + #endif + void tcp_parse_options(const struct net *net, const struct sk_buff *skb, + struct tcp_options_received *opt_rx, +- int estab, struct tcp_fastopen_cookie *foc); ++ struct mptcp_options_received *mopt_rx, ++ int estab, struct tcp_fastopen_cookie *foc, ++ struct tcp_sock *tp); + const u8 *tcp_parse_md5sig_option(const struct tcphdr *th); + + /* +@@ -430,6 +550,7 @@ u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops, + + void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb); + void tcp_v4_mtu_reduced(struct sock *sk); ++void tcp_v6_mtu_reduced(struct sock *sk); + void tcp_req_err(struct sock *sk, u32 seq, bool abort); + int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb); + struct sock *tcp_create_openreq_child(const struct sock *sk, +@@ -453,6 +574,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, + struct request_sock *req, + struct tcp_fastopen_cookie *foc, + enum tcp_synack_type synack_type); ++void tcp_reset_vars(struct sock *sk); + int tcp_disconnect(struct sock *sk, int flags); + + void tcp_finish_connect(struct sock *sk, struct sk_buff *skb); +@@ -462,6 +584,7 @@ void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb); + /* From syncookies.c */ + struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, ++ 
const struct mptcp_options_received *mopt, + struct dst_entry *dst, u32 tsoff); + int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th, + u32 cookie); +@@ -547,7 +670,8 @@ static inline u32 tcp_cookie_time(void) + + u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th, + u16 *mssp); +-__u32 cookie_v4_init_sequence(const struct sk_buff *skb, __u16 *mss); ++__u32 cookie_v4_init_sequence(struct request_sock *req, const struct sock *sk, ++ const struct sk_buff *skb, __u16 *mss); + u64 cookie_init_timestamp(struct request_sock *req); + bool cookie_timestamp_decode(const struct net *net, + struct tcp_options_received *opt); +@@ -561,7 +685,8 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb); + + u32 __cookie_v6_init_sequence(const struct ipv6hdr *iph, + const struct tcphdr *th, u16 *mssp); +-__u32 cookie_v6_init_sequence(const struct sk_buff *skb, __u16 *mss); ++__u32 cookie_v6_init_sequence(struct request_sock *req, const struct sock *sk, ++ const struct sk_buff *skb, __u16 *mss); + #endif + /* tcp_output.c */ + +@@ -597,10 +722,16 @@ bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto); + void tcp_skb_collapse_tstamp(struct sk_buff *skb, + const struct sk_buff *next_skb); + ++u16 tcp_select_window(struct sock *sk); ++bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, ++ int push_one, gfp_t gfp); ++ + /* tcp_input.c */ + void tcp_rearm_rto(struct sock *sk); + void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req); + void tcp_reset(struct sock *sk); ++void tcp_set_rto(struct sock *sk); ++bool tcp_should_expand_sndbuf(const struct sock *sk); + void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb); + void tcp_fin(struct sock *sk); + +@@ -645,7 +776,7 @@ static inline int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize) + } + + /* tcp.c */ +-void tcp_get_info(struct sock *, struct tcp_info *); ++void tcp_get_info(struct sock *, struct tcp_info *, bool no_lock); + + /* Read 'sendfile()'-style from a TCP socket */ + int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, +@@ -723,7 +854,7 @@ static inline u32 tcp_min_rtt(const struct tcp_sock *tp) + * Rcv_nxt can be after the window if our peer push more data + * than the offered window. + */ +-static inline u32 tcp_receive_window(const struct tcp_sock *tp) ++static inline u32 tcp_receive_window_now(const struct tcp_sock *tp) + { + s32 win = tp->rcv_wup + tp->rcv_wnd - tp->rcv_nxt; + +@@ -732,6 +863,32 @@ static inline u32 tcp_receive_window(const struct tcp_sock *tp) + return (u32) win; + } + ++/* right edge only moves forward, even if window shrinks due ++ * to mptcp meta ++ */ ++static inline void tcp_update_rcv_right_edge(struct tcp_sock *tp) ++{ ++ if (after(tp->rcv_wup + tp->rcv_wnd, tp->rcv_right_edge)) ++ tp->rcv_right_edge = tp->rcv_wup + tp->rcv_wnd; ++} ++ ++/* Compute receive window which will never shrink. The way MPTCP handles ++ * the receive window can cause the effective right edge to shrink, ++ * causing valid segments to become out of window. ++ * This function should be used when checking if a segment is valid for ++ * the max right edge announced. 
++ */ ++static inline u32 tcp_receive_window_no_shrink(const struct tcp_sock *tp) ++{ ++ s32 win = tp->rcv_right_edge - tp->rcv_nxt; ++ ++ win = max_t(s32, win, tp->rcv_wup + tp->rcv_wnd - tp->rcv_nxt); ++ ++ if (unlikely(win < 0)) ++ win = 0; ++ return (u32) win; ++} ++ + /* Choose a new window, without checks for shrinking, and without + * scaling applied to the result. The caller does these things + * if necessary. This is a "raw" window selection. +@@ -829,6 +986,12 @@ struct tcp_skb_cb { + u16 tcp_gso_size; + }; + }; ++ ++#ifdef CONFIG_MPTCP ++ __u8 mptcp_flags; /* flags for the MPTCP layer */ ++ __u8 dss_off; /* Number of 4-byte words until ++ * seq-number */ ++#endif + __u8 tcp_flags; /* TCP header flags. (tcp[13]) */ + + __u8 sacked; /* State flags for SACK. */ +@@ -847,6 +1010,14 @@ struct tcp_skb_cb { + has_rxtstamp:1, /* SKB has a RX timestamp */ + unused:5; + __u32 ack_seq; /* Sequence number ACK'd */ ++ ++#ifdef CONFIG_MPTCP ++ union { /* For MPTCP outgoing frames */ ++ __u32 path_mask; /* paths that tried to send this skb */ ++ __u32 dss[6]; /* DSS options */ ++ }; ++#endif ++ + union { + struct { + /* There is space for up to 24 bytes */ +@@ -1088,6 +1259,8 @@ void tcp_get_allowed_congestion_control(char *buf, size_t len); + int tcp_set_allowed_congestion_control(char *allowed); + int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, + bool reinit, bool cap_net_admin); ++int __tcp_set_congestion_control(struct sock *sk, const char *name, bool load, ++ bool reinit, bool cap_net_admin); + u32 tcp_slow_start(struct tcp_sock *tp, u32 acked); + void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked); + +@@ -1389,6 +1562,19 @@ static inline int tcp_win_from_space(const struct sock *sk, int space) + space - (space>>tcp_adv_win_scale); + } + ++#ifdef CONFIG_MPTCP ++extern struct static_key mptcp_static_key; ++static inline bool mptcp(const struct tcp_sock *tp) ++{ ++ return static_key_false(&mptcp_static_key) && tp->mpc; ++} ++#else ++static inline bool mptcp(const struct tcp_sock *tp) ++{ ++ return 0; ++} ++#endif ++ + /* Note: caller must be prepared to deal with negative returns */ + static inline int tcp_space(const struct sock *sk) + { +@@ -1981,6 +2167,30 @@ struct tcp_sock_af_ops { + #endif + }; + ++/* TCP/MPTCP-specific functions */ ++struct tcp_sock_ops { ++ u32 (*__select_window)(struct sock *sk); ++ u16 (*select_window)(struct sock *sk); ++ void (*select_initial_window)(const struct sock *sk, int __space, ++ __u32 mss, __u32 *rcv_wnd, ++ __u32 *window_clamp, int wscale_ok, ++ __u8 *rcv_wscale, __u32 init_rcv_wnd); ++ void (*init_buffer_space)(struct sock *sk); ++ void (*set_rto)(struct sock *sk); ++ bool (*should_expand_sndbuf)(const struct sock *sk); ++ void (*send_fin)(struct sock *sk); ++ bool (*write_xmit)(struct sock *sk, unsigned int mss_now, int nonagle, ++ int push_one, gfp_t gfp); ++ void (*send_active_reset)(struct sock *sk, gfp_t priority); ++ int (*write_wakeup)(struct sock *sk, int mib); ++ void (*retransmit_timer)(struct sock *sk); ++ void (*time_wait)(struct sock *sk, int state, int timeo); ++ void (*cleanup_rbuf)(struct sock *sk, int copied); ++ int (*set_cong_ctrl)(struct sock *sk, const char *name, bool load, ++ bool reinit, bool cap_net_admin); ++}; ++extern const struct tcp_sock_ops tcp_specific; ++ + struct tcp_request_sock_ops { + u16 mss_clamp; + #ifdef CONFIG_TCP_MD5SIG +@@ -1991,12 +2201,13 @@ struct tcp_request_sock_ops { + const struct sock *sk, + const struct sk_buff *skb); + #endif +- void (*init_req)(struct 
request_sock *req, +- const struct sock *sk_listener, +- struct sk_buff *skb); ++ int (*init_req)(struct request_sock *req, ++ const struct sock *sk_listener, ++ struct sk_buff *skb, ++ bool want_cookie); + #ifdef CONFIG_SYN_COOKIES +- __u32 (*cookie_init_seq)(const struct sk_buff *skb, +- __u16 *mss); ++ __u32 (*cookie_init_seq)(struct request_sock *req, const struct sock *sk, ++ const struct sk_buff *skb, __u16 *mss); + #endif + struct dst_entry *(*route_req)(const struct sock *sk, struct flowi *fl, + const struct request_sock *req); +@@ -2010,15 +2221,17 @@ struct tcp_request_sock_ops { + + #ifdef CONFIG_SYN_COOKIES + static inline __u32 cookie_init_sequence(const struct tcp_request_sock_ops *ops, ++ struct request_sock *req, + const struct sock *sk, struct sk_buff *skb, + __u16 *mss) + { + tcp_synq_overflow(sk); + __NET_INC_STATS(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT); +- return ops->cookie_init_seq(skb, mss); ++ return ops->cookie_init_seq(req, sk, skb, mss); + } + #else + static inline __u32 cookie_init_sequence(const struct tcp_request_sock_ops *ops, ++ struct request_sock *req, + const struct sock *sk, struct sk_buff *skb, + __u16 *mss) + { +diff --git a/include/net/tcp_states.h b/include/net/tcp_states.h +index cc00118acca1..11084091e798 100644 +--- a/include/net/tcp_states.h ++++ b/include/net/tcp_states.h +@@ -22,6 +22,7 @@ enum { + TCP_LISTEN, + TCP_CLOSING, /* Now a valid state */ + TCP_NEW_SYN_RECV, ++ TCP_RST_WAIT, + + TCP_MAX_STATES /* Leave at the end! */ + }; +@@ -43,6 +44,7 @@ enum { + TCPF_LISTEN = (1 << TCP_LISTEN), + TCPF_CLOSING = (1 << TCP_CLOSING), + TCPF_NEW_SYN_RECV = (1 << TCP_NEW_SYN_RECV), ++ TCPF_RST_WAIT = (1 << TCP_RST_WAIT), + }; + + #endif /* _LINUX_TCP_STATES_H */ +diff --git a/include/net/transp_v6.h b/include/net/transp_v6.h +index a8f6020f1196..5e70b086fdfb 100644 +--- a/include/net/transp_v6.h ++++ b/include/net/transp_v6.h +@@ -58,6 +58,8 @@ ip6_dgram_sock_seq_show(struct seq_file *seq, struct sock *sp, __u16 srcp, + + /* address family specific functions */ + extern const struct inet_connection_sock_af_ops ipv4_specific; ++extern const struct inet_connection_sock_af_ops ipv6_mapped; ++extern const struct inet_connection_sock_af_ops ipv6_specific; + + void inet6_destroy_sock(struct sock *sk); + +diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h +index cf97f6339acb..cf48dc87a734 100644 +--- a/include/trace/events/tcp.h ++++ b/include/trace/events/tcp.h +@@ -10,6 +10,7 @@ + #include + #include + #include ++#include + #include + + #define TP_STORE_V4MAPPED(__entry, saddr, daddr) \ +@@ -181,6 +182,13 @@ DEFINE_EVENT(tcp_event_sk, tcp_rcv_space_adjust, + TP_ARGS(sk) + ); + ++DEFINE_EVENT(tcp_event_sk_skb, mptcp_retransmit, ++ ++ TP_PROTO(const struct sock *sk, const struct sk_buff *skb), ++ ++ TP_ARGS(sk, skb) ++); ++ + TRACE_EVENT(tcp_retransmit_synack, + + TP_PROTO(const struct sock *sk, const struct request_sock *req), +@@ -248,6 +256,7 @@ TRACE_EVENT(tcp_probe, + __field(__u32, srtt) + __field(__u32, rcv_wnd) + __field(__u64, sock_cookie) ++ __field(__u8, mptcp) + ), + + TP_fast_assign( +@@ -274,13 +283,15 @@ TRACE_EVENT(tcp_probe, + __entry->ssthresh = tcp_current_ssthresh(sk); + __entry->srtt = tp->srtt_us >> 3; + __entry->sock_cookie = sock_gen_cookie(sk); ++ __entry->mptcp = mptcp(tp) ? 
tp->mptcp->path_index : 0; + ), + +- TP_printk("src=%pISpc dest=%pISpc mark=%#x data_len=%d snd_nxt=%#x snd_una=%#x snd_cwnd=%u ssthresh=%u snd_wnd=%u srtt=%u rcv_wnd=%u sock_cookie=%llx", ++ TP_printk("src=%pISpc dest=%pISpc mark=%#x data_len=%d snd_nxt=%#x snd_una=%#x snd_cwnd=%u ssthresh=%u snd_wnd=%u srtt=%u rcv_wnd=%u sock_cookie=%llx mptcp=%d", + __entry->saddr, __entry->daddr, __entry->mark, + __entry->data_len, __entry->snd_nxt, __entry->snd_una, + __entry->snd_cwnd, __entry->ssthresh, __entry->snd_wnd, +- __entry->srtt, __entry->rcv_wnd, __entry->sock_cookie) ++ __entry->srtt, __entry->rcv_wnd, __entry->sock_cookie, ++ __entry->mptcp) + ); + + #endif /* _TRACE_TCP_H */ +diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h +index 63038eb23560..7150eb62db86 100644 +--- a/include/uapi/linux/bpf.h ++++ b/include/uapi/linux/bpf.h +@@ -3438,6 +3438,7 @@ enum { + BPF_TCP_LISTEN, + BPF_TCP_CLOSING, /* Now a valid state */ + BPF_TCP_NEW_SYN_RECV, ++ BPF_TCP_RST_WAIT, + + BPF_TCP_MAX_STATES /* Leave at the end! */ + }; +diff --git a/include/uapi/linux/if.h b/include/uapi/linux/if.h +index 7fea0fd7d6f5..7255e08393db 100644 +--- a/include/uapi/linux/if.h ++++ b/include/uapi/linux/if.h +@@ -132,6 +132,9 @@ enum net_device_flags { + #define IFF_ECHO IFF_ECHO + #endif /* __UAPI_DEF_IF_NET_DEVICE_FLAGS_LOWER_UP_DORMANT_ECHO */ + ++#define IFF_NOMULTIPATH 0x80000 /* Disable for MPTCP */ ++#define IFF_MPBACKUP 0x100000 /* Use as backup path for MPTCP */ ++ + #define IFF_VOLATILE (IFF_LOOPBACK|IFF_POINTOPOINT|IFF_BROADCAST|IFF_ECHO|\ + IFF_MASTER|IFF_SLAVE|IFF_RUNNING|IFF_LOWER_UP|IFF_DORMANT) + +diff --git a/include/uapi/linux/in.h b/include/uapi/linux/in.h +index 60e1241d4b77..ff6185b1d79f 100644 +--- a/include/uapi/linux/in.h ++++ b/include/uapi/linux/in.h +@@ -76,6 +76,8 @@ enum { + #define IPPROTO_MPLS IPPROTO_MPLS + IPPROTO_RAW = 255, /* Raw IP packets */ + #define IPPROTO_RAW IPPROTO_RAW ++ IPPROTO_MPTCP = 262, /* Multipath TCP connection */ ++#define IPPROTO_MPTCP IPPROTO_MPTCP + IPPROTO_MAX + }; + #endif +diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h +new file mode 100644 +index 000000000000..02078c80c846 +--- /dev/null ++++ b/include/uapi/linux/mptcp.h +@@ -0,0 +1,151 @@ ++/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ ++/* ++ * Netlink API for Multipath TCP ++ * ++ * Author: Gregory Detal ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License ++ * as published by the Free Software Foundation; either version ++ * 2 of the License, or (at your option) any later version. 
++ */ ++ ++#ifndef _LINUX_MPTCP_H ++#define _LINUX_MPTCP_H ++ ++#define MPTCP_GENL_NAME "mptcp" ++#define MPTCP_GENL_EV_GRP_NAME "mptcp_events" ++#define MPTCP_GENL_CMD_GRP_NAME "mptcp_commands" ++#define MPTCP_GENL_VER 0x1 ++ ++/* ++ * ATTR types defined for MPTCP ++ */ ++enum { ++ MPTCP_ATTR_UNSPEC = 0, ++ ++ MPTCP_ATTR_TOKEN, /* u32 */ ++ MPTCP_ATTR_FAMILY, /* u16 */ ++ MPTCP_ATTR_LOC_ID, /* u8 */ ++ MPTCP_ATTR_REM_ID, /* u8 */ ++ MPTCP_ATTR_SADDR4, /* u32 */ ++ MPTCP_ATTR_SADDR6, /* struct in6_addr */ ++ MPTCP_ATTR_DADDR4, /* u32 */ ++ MPTCP_ATTR_DADDR6, /* struct in6_addr */ ++ MPTCP_ATTR_SPORT, /* u16 */ ++ MPTCP_ATTR_DPORT, /* u16 */ ++ MPTCP_ATTR_BACKUP, /* u8 */ ++ MPTCP_ATTR_ERROR, /* u8 */ ++ MPTCP_ATTR_FLAGS, /* u16 */ ++ MPTCP_ATTR_TIMEOUT, /* u32 */ ++ MPTCP_ATTR_IF_IDX, /* s32 */ ++ ++ __MPTCP_ATTR_AFTER_LAST ++}; ++ ++#define MPTCP_ATTR_MAX (__MPTCP_ATTR_AFTER_LAST - 1) ++ ++/* ++ * Events generated by MPTCP: ++ * - MPTCP_EVENT_CREATED: token, family, saddr4 | saddr6, daddr4 | daddr6, ++ * sport, dport ++ * A new connection has been created. It is the good time to allocate ++ * memory and send ADD_ADDR if needed. Depending on the traffic-patterns ++ * it can take a long time until the MPTCP_EVENT_ESTABLISHED is sent. ++ * ++ * - MPTCP_EVENT_ESTABLISHED: token, family, saddr4 | saddr6, daddr4 | daddr6, ++ * sport, dport ++ * A connection is established (can start new subflows). ++ * ++ * - MPTCP_EVENT_CLOSED: token ++ * A connection has stopped. ++ * ++ * - MPTCP_EVENT_ANNOUNCED: token, rem_id, family, daddr4 | daddr6 [, dport] ++ * A new address has been announced by the peer. ++ * ++ * - MPTCP_EVENT_REMOVED: token, rem_id ++ * An address has been lost by the peer. ++ * ++ * - MPTCP_EVENT_SUB_ESTABLISHED: token, family, loc_id, rem_id, ++ * saddr4 | saddr6, daddr4 | daddr6, sport, ++ * dport, backup, if_idx [, error] ++ * A new subflow has been established. 'error' should not be set. ++ * ++ * - MPTCP_EVENT_SUB_CLOSED: token, family, loc_id, rem_id, saddr4 | saddr6, ++ * daddr4 | daddr6, sport, dport, backup, if_idx ++ * [, error] ++ * A subflow has been closed. An error (copy of sk_err) could be set if an ++ * error has been detected for this subflow. ++ * ++ * - MPTCP_EVENT_SUB_PRIORITY: token, family, loc_id, rem_id, saddr4 | saddr6, ++ * daddr4 | daddr6, sport, dport, backup, if_idx ++ * [, error] ++ * The priority of a subflow has changed. 'error' should not be set. ++ * ++ * Commands for MPTCP: ++ * - MPTCP_CMD_ANNOUNCE: token, loc_id, family, saddr4 | saddr6 [, sport] ++ * Announce a new address to the peer. ++ * ++ * - MPTCP_CMD_REMOVE: token, loc_id ++ * Announce that an address has been lost to the peer. ++ * ++ * - MPTCP_CMD_SUB_CREATE: token, family, loc_id, rem_id, daddr4 | daddr6, ++ * dport [, saddr4 | saddr6, sport, backup, if_idx] ++ * Create a new subflow. ++ * ++ * - MPTCP_CMD_SUB_DESTROY: token, family, saddr4 | saddr6, daddr4 | daddr6, ++ * sport, dport ++ * Close a subflow. ++ * ++ * - MPTCP_CMD_SUB_PRIORITY: token, family, saddr4 | saddr6, daddr4 | daddr6, ++ * sport, dport, backup ++ * Change the priority of a subflow. ++ * ++ * - MPTCP_CMD_SET_FILTER: flags ++ * Set the filter on events. Set MPTCPF_* flags to only receive specific ++ * events. Default is to receive all events. ++ * ++ * - MPTCP_CMD_EXIST: token ++ * Check if this token is linked to an existing socket. 
++ */ ++enum { ++ MPTCP_CMD_UNSPEC = 0, ++ ++ MPTCP_EVENT_CREATED, ++ MPTCP_EVENT_ESTABLISHED, ++ MPTCP_EVENT_CLOSED, ++ ++ MPTCP_CMD_ANNOUNCE, ++ MPTCP_CMD_REMOVE, ++ MPTCP_EVENT_ANNOUNCED, ++ MPTCP_EVENT_REMOVED, ++ ++ MPTCP_CMD_SUB_CREATE, ++ MPTCP_CMD_SUB_DESTROY, ++ MPTCP_EVENT_SUB_ESTABLISHED, ++ MPTCP_EVENT_SUB_CLOSED, ++ ++ MPTCP_CMD_SUB_PRIORITY, ++ MPTCP_EVENT_SUB_PRIORITY, ++ ++ MPTCP_CMD_SET_FILTER, ++ ++ MPTCP_CMD_EXIST, ++ ++ __MPTCP_CMD_AFTER_LAST ++}; ++ ++#define MPTCP_CMD_MAX (__MPTCP_CMD_AFTER_LAST - 1) ++ ++enum { ++ MPTCPF_EVENT_CREATED = (1 << 1), ++ MPTCPF_EVENT_ESTABLISHED = (1 << 2), ++ MPTCPF_EVENT_CLOSED = (1 << 3), ++ MPTCPF_EVENT_ANNOUNCED = (1 << 4), ++ MPTCPF_EVENT_REMOVED = (1 << 5), ++ MPTCPF_EVENT_SUB_ESTABLISHED = (1 << 6), ++ MPTCPF_EVENT_SUB_CLOSED = (1 << 7), ++ MPTCPF_EVENT_SUB_PRIORITY = (1 << 8), ++}; ++ ++#endif /* _LINUX_MPTCP_H */ +diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h +index 81e697978e8b..09ef515261d2 100644 +--- a/include/uapi/linux/tcp.h ++++ b/include/uapi/linux/tcp.h +@@ -18,9 +18,15 @@ + #ifndef _UAPI_LINUX_TCP_H + #define _UAPI_LINUX_TCP_H + +-#include ++#ifndef __KERNEL__ ++#include ++#endif ++ + #include ++#include ++#include + #include ++#include + + struct tcphdr { + __be16 source; +@@ -134,6 +140,13 @@ enum { + #define TCP_REPAIR_OFF 0 + #define TCP_REPAIR_OFF_NO_WP -1 /* Turn off without window probes */ + ++#define MPTCP_ENABLED 42 ++#define MPTCP_SCHEDULER 43 ++#define MPTCP_PATH_MANAGER 44 ++#define MPTCP_INFO 45 ++ ++#define MPTCP_INFO_FLAG_SAVE_MASTER 0x01 ++ + struct tcp_repair_opt { + __u32 opt_code; + __u32 opt_val; +@@ -305,6 +318,53 @@ enum { + TCP_NLA_SRTT, /* smoothed RTT in usecs */ + }; + ++struct mptcp_meta_info { ++ __u8 mptcpi_state; ++ __u8 mptcpi_retransmits; ++ __u8 mptcpi_probes; ++ __u8 mptcpi_backoff; ++ ++ __u32 mptcpi_rto; ++ __u32 mptcpi_unacked; ++ ++ /* Times. 
*/ ++ __u32 mptcpi_last_data_sent; ++ __u32 mptcpi_last_data_recv; ++ __u32 mptcpi_last_ack_recv; ++ ++ __u32 mptcpi_total_retrans; ++ ++ __u64 mptcpi_bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked */ ++ __u64 mptcpi_bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived */ ++}; ++ ++struct mptcp_sub_info { ++ union { ++ struct sockaddr src; ++ struct sockaddr_in src_v4; ++ struct sockaddr_in6 src_v6; ++ }; ++ ++ union { ++ struct sockaddr dst; ++ struct sockaddr_in dst_v4; ++ struct sockaddr_in6 dst_v6; ++ }; ++}; ++ ++struct mptcp_info { ++ __u32 tcp_info_len; /* Length of each struct tcp_info in subflows pointer */ ++ __u32 sub_len; /* Total length of memory pointed to by subflows pointer */ ++ __u32 meta_len; /* Length of memory pointed to by meta_info */ ++ __u32 sub_info_len; /* Length of each struct mptcp_sub_info in subflow_info pointer */ ++ __u32 total_sub_info_len; /* Total length of memory pointed to by subflow_info */ ++ ++ struct mptcp_meta_info *meta_info; ++ struct tcp_info *initial; ++ struct tcp_info *subflows; /* Pointer to array of tcp_info structs */ ++ struct mptcp_sub_info *subflow_info; ++}; ++ + /* for TCP_MD5SIG socket option */ + #define TCP_MD5SIG_MAXKEYLEN 80 + +diff --git a/net/Kconfig b/net/Kconfig +index 0b2fecc83452..66f9158a3040 100644 +--- a/net/Kconfig ++++ b/net/Kconfig +@@ -94,6 +94,7 @@ if INET + source "net/ipv4/Kconfig" + source "net/ipv6/Kconfig" + source "net/netlabel/Kconfig" ++source "net/mptcp/Kconfig" + + endif # if INET + +diff --git a/net/Makefile b/net/Makefile +index 449fc0b221f8..08683343642e 100644 +--- a/net/Makefile ++++ b/net/Makefile +@@ -20,6 +20,7 @@ obj-$(CONFIG_TLS) += tls/ + obj-$(CONFIG_XFRM) += xfrm/ + obj-$(CONFIG_UNIX_SCM) += unix/ + obj-$(CONFIG_NET) += ipv6/ ++obj-$(CONFIG_MPTCP) += mptcp/ + obj-$(CONFIG_BPFILTER) += bpfilter/ + obj-$(CONFIG_PACKET) += packet/ + obj-$(CONFIG_NET_KEY) += key/ +diff --git a/net/core/dev.c b/net/core/dev.c +index 3810eaf89b26..a8a1fba9b4ec 100644 +--- a/net/core/dev.c ++++ b/net/core/dev.c +@@ -7880,7 +7880,7 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags, + + dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP | + IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL | +- IFF_AUTOMEDIA)) | ++ IFF_AUTOMEDIA | IFF_NOMULTIPATH | IFF_MPBACKUP)) | + (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC | + IFF_ALLMULTI)); + +diff --git a/net/core/filter.c b/net/core/filter.c +index 0e161a6dff7e..431996bd5a16 100644 +--- a/net/core/filter.c ++++ b/net/core/filter.c +@@ -73,6 +73,7 @@ + #include + #include + #include ++#include + + /** + * sk_filter_trim_cap - run a packet through a socket filter +@@ -4280,6 +4281,19 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, + if (sk->sk_mark != val) { + sk->sk_mark = val; + sk_dst_reset(sk); ++ ++ if (is_meta_sk(sk)) { ++ struct mptcp_tcp_sock *mptcp; ++ ++ mptcp_for_each_sub(tcp_sk(sk)->mpcb, mptcp) { ++ struct sock *sk_it = mptcp_to_sock(mptcp); ++ ++ if (val != sk_it->sk_mark) { ++ sk_it->sk_mark = val; ++ sk_dst_reset(sk_it); ++ } ++ } ++ } + } + break; + default: +@@ -4302,6 +4316,14 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, + if (val == -1) + val = 0; + inet->tos = val; ++ ++ /* Update TOS on mptcp subflow */ ++ if (is_meta_sk(sk)) { ++ struct mptcp_tcp_sock *mptcp; ++ ++ mptcp_for_each_sub(tcp_sk(sk)->mpcb, mptcp) ++ inet_sk(mptcp_to_sock(mptcp))->tos = val; ++ } + } + break; + default: +@@ -4324,6 +4346,17 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, + 
if (val == -1) + val = 0; + np->tclass = val; ++ ++ if (is_meta_sk(sk)) { ++ struct mptcp_tcp_sock *mptcp; ++ ++ mptcp_for_each_sub(tcp_sk(sk)->mpcb, mptcp) { ++ struct sock *sk_it = mptcp_to_sock(mptcp); ++ ++ if (sk_it->sk_family == AF_INET6) ++ inet6_sk(sk_it)->tclass = val; ++ } ++ } + } + break; + default: +diff --git a/net/core/net-traces.c b/net/core/net-traces.c +index 283ddb2dbc7d..8f526a0d1912 100644 +--- a/net/core/net-traces.c ++++ b/net/core/net-traces.c +@@ -60,3 +60,5 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb); + EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll); + + EXPORT_TRACEPOINT_SYMBOL_GPL(tcp_send_reset); ++ ++EXPORT_TRACEPOINT_SYMBOL_GPL(mptcp_retransmit); +diff --git a/net/core/skbuff.c b/net/core/skbuff.c +index 7dba091bc861..79ed7efe1c0c 100644 +--- a/net/core/skbuff.c ++++ b/net/core/skbuff.c +@@ -582,7 +582,7 @@ static inline void skb_drop_fraglist(struct sk_buff *skb) + skb_drop_list(&skb_shinfo(skb)->frag_list); + } + +-static void skb_clone_fraglist(struct sk_buff *skb) ++void skb_clone_fraglist(struct sk_buff *skb) + { + struct sk_buff *list; + +diff --git a/net/core/sock.c b/net/core/sock.c +index 57b7a10703c3..8d716113e273 100644 +--- a/net/core/sock.c ++++ b/net/core/sock.c +@@ -135,6 +135,11 @@ + + #include + ++#ifdef CONFIG_MPTCP ++#include ++#include ++#endif ++ + #include + #include + +@@ -1063,6 +1068,19 @@ int sock_setsockopt(struct socket *sock, int level, int optname, + } else if (val != sk->sk_mark) { + sk->sk_mark = val; + sk_dst_reset(sk); ++ ++ if (is_meta_sk(sk)) { ++ struct mptcp_tcp_sock *mptcp; ++ ++ mptcp_for_each_sub(tcp_sk(sk)->mpcb, mptcp) { ++ struct sock *sk_it = mptcp_to_sock(mptcp); ++ ++ if (val != sk_it->sk_mark) { ++ sk_it->sk_mark = val; ++ sk_dst_reset(sk_it); ++ } ++ } ++ } + } + break; + +@@ -1563,6 +1581,23 @@ int sock_getsockopt(struct socket *sock, int level, int optname, + */ + static inline void sock_lock_init(struct sock *sk) + { ++#ifdef CONFIG_MPTCP ++ /* Reclassify the lock-class for subflows */ ++ if (sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP) ++ if (mptcp(tcp_sk(sk)) || tcp_sk(sk)->is_master_sk) { ++ sock_lock_init_class_and_name(sk, meta_slock_key_name, ++ &meta_slock_key, ++ meta_key_name, ++ &meta_key); ++ ++ /* We don't yet have the mptcp-point. ++ * Thus we still need inet_sock_destruct ++ */ ++ sk->sk_destruct = inet_sock_destruct; ++ return; ++ } ++#endif ++ + if (sk->sk_kern_sock) + sock_lock_init_class_and_name( + sk, +@@ -1611,8 +1646,12 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, + sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO); + if (!sk) + return sk; +- if (want_init_on_alloc(priority)) +- sk_prot_clear_nulls(sk, prot->obj_size); ++ if (want_init_on_alloc(priority)) { ++ if (prot->clear_sk) ++ prot->clear_sk(sk, prot->obj_size); ++ else ++ sk_prot_clear_nulls(sk, prot->obj_size); ++ } + } else + sk = kmalloc(prot->obj_size, priority); + +@@ -1846,6 +1885,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) + atomic_set(&newsk->sk_zckey, 0); + + sock_reset_flag(newsk, SOCK_DONE); ++ sock_reset_flag(newsk, SOCK_MPTCP); + + /* sk->sk_memcg will be populated at accept() time */ + newsk->sk_memcg = NULL; +diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig +index a926de2e42b5..6d73dc6e2586 100644 +--- a/net/ipv4/Kconfig ++++ b/net/ipv4/Kconfig +@@ -655,6 +655,51 @@ config TCP_CONG_BBR + bufferbloat, policers, or AQM schemes that do not provide a delay + signal. It requires the fq ("Fair Queue") pacing packet scheduler. 
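Ahead of the Kconfig hunk below, which adds the MPTCP coupled congestion controls (lia, olia, wvegas, balia and the experimental mctcpdesync), here is a minimal userspace sketch of selecting one of them per socket instead of through the tcp_congestion_control sysctl mentioned in their help texts. This is illustrative only and not part of the patch: it assumes a kernel built with this patch set and the "lia" module available, and it reuses the MPTCP_ENABLED TCP-level socket option value (42) defined later in this series; per the do_tcp_setsockopt() handling added below, that option is only accepted while the socket is still closed and net.mptcp.mptcp_enabled is set.

/*
 * Illustrative sketch, not part of the patch set: opt a socket in to MPTCP
 * and pick the "lia" coupled congestion control for it. Error handling is
 * reduced to perror() so the expected failure modes stay visible.
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

#ifndef MPTCP_ENABLED
#define MPTCP_ENABLED 42	/* value introduced by this patch set in uapi/linux/tcp.h */
#endif

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
	int one = 1;

	if (fd < 0) {
		perror("socket");
		return 1;
	}

	/* Request MPTCP on this socket; rejected (EPERM) unless the socket is
	 * still TCP_CLOSE and the mptcp_enabled sysctl allows it.
	 */
	if (setsockopt(fd, IPPROTO_TCP, MPTCP_ENABLED, &one, sizeof(one)) < 0)
		perror("MPTCP_ENABLED");

	/* Select the coupled controller; "olia", "wvegas" or "balia" would be
	 * set the same way, provided the corresponding module is loaded.
	 */
	if (setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, "lia", strlen("lia")) < 0)
		perror("TCP_CONGESTION");

	close(fd);
	return 0;
}

Connections made on such a socket then negotiate MPTCP and run the coupled controller across all subflows, provided the peer also supports MPTCP; otherwise they fall back to regular TCP with that congestion control.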
+ ++config TCP_CONG_LIA ++ tristate "MPTCP Linked Increase" ++ depends on MPTCP ++ default n ++ ---help--- ++ MultiPath TCP Linked Increase Congestion Control ++ To enable it, just put 'lia' in tcp_congestion_control ++ ++config TCP_CONG_OLIA ++ tristate "MPTCP Opportunistic Linked Increase" ++ depends on MPTCP ++ default n ++ ---help--- ++ MultiPath TCP Opportunistic Linked Increase Congestion Control ++ To enable it, just put 'olia' in tcp_congestion_control ++ ++config TCP_CONG_WVEGAS ++ tristate "MPTCP WVEGAS CONGESTION CONTROL" ++ depends on MPTCP ++ default n ++ ---help--- ++ wVegas congestion control for MPTCP ++ To enable it, just put 'wvegas' in tcp_congestion_control ++ ++config TCP_CONG_BALIA ++ tristate "MPTCP BALIA CONGESTION CONTROL" ++ depends on MPTCP ++ default n ++ ---help--- ++ Multipath TCP Balanced Linked Adaptation Congestion Control ++ To enable it, just put 'balia' in tcp_congestion_control ++ ++config TCP_CONG_MCTCPDESYNC ++ tristate "DESYNCHRONIZED MCTCP CONGESTION CONTROL (EXPERIMENTAL)" ++ depends on MPTCP ++ default n ++ ---help--- ++ Desynchronized MultiChannel TCP Congestion Control. This is experimental ++ code that only supports single path and must have set mptcp_ndiffports ++ larger than one. ++ To enable it, just put 'mctcpdesync' in tcp_congestion_control ++ For further details see: ++ http://ieeexplore.ieee.org/abstract/document/6911722/ ++ https://doi.org/10.1016/j.comcom.2015.07.010 ++ + choice + prompt "Default TCP congestion control" + default DEFAULT_CUBIC +@@ -692,6 +737,21 @@ choice + config DEFAULT_BBR + bool "BBR" if TCP_CONG_BBR=y + ++ config DEFAULT_LIA ++ bool "Lia" if TCP_CONG_LIA=y ++ ++ config DEFAULT_OLIA ++ bool "Olia" if TCP_CONG_OLIA=y ++ ++ config DEFAULT_WVEGAS ++ bool "Wvegas" if TCP_CONG_WVEGAS=y ++ ++ config DEFAULT_BALIA ++ bool "Balia" if TCP_CONG_BALIA=y ++ ++ config DEFAULT_MCTCPDESYNC ++ bool "Mctcpdesync (EXPERIMENTAL)" if TCP_CONG_MCTCPDESYNC=y ++ + config DEFAULT_RENO + bool "Reno" + endchoice +@@ -712,6 +772,10 @@ config DEFAULT_TCP_CONG + default "vegas" if DEFAULT_VEGAS + default "westwood" if DEFAULT_WESTWOOD + default "veno" if DEFAULT_VENO ++ default "lia" if DEFAULT_LIA ++ default "olia" if DEFAULT_OLIA ++ default "wvegas" if DEFAULT_WVEGAS ++ default "balia" if DEFAULT_BALIA + default "reno" if DEFAULT_RENO + default "dctcp" if DEFAULT_DCTCP + default "cdg" if DEFAULT_CDG +diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c +index 70f92aaca411..0f4633257c75 100644 +--- a/net/ipv4/af_inet.c ++++ b/net/ipv4/af_inet.c +@@ -100,6 +100,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -150,6 +151,9 @@ void inet_sock_destruct(struct sock *sk) + return; + } + ++ if (sock_flag(sk, SOCK_MPTCP)) ++ mptcp_disable_static_key(); ++ + WARN_ON(atomic_read(&sk->sk_rmem_alloc)); + WARN_ON(refcount_read(&sk->sk_wmem_alloc)); + WARN_ON(sk->sk_wmem_queued); +@@ -227,6 +231,8 @@ int inet_listen(struct socket *sock, int backlog) + tcp_fastopen_init_key_once(sock_net(sk)); + } + ++ mptcp_init_listen(sk); ++ + err = inet_csk_listen_start(sk, backlog); + if (err) + goto out; +@@ -244,8 +250,7 @@ EXPORT_SYMBOL(inet_listen); + * Create an inet socket. 
+ */ + +-static int inet_create(struct net *net, struct socket *sock, int protocol, +- int kern) ++int inet_create(struct net *net, struct socket *sock, int protocol, int kern) + { + struct sock *sk; + struct inet_protosw *answer; +@@ -739,6 +744,24 @@ int inet_accept(struct socket *sock, struct socket *newsock, int flags, + lock_sock(sk2); + + sock_rps_record_flow(sk2); ++ ++ if (sk2->sk_protocol == IPPROTO_TCP && mptcp(tcp_sk(sk2))) { ++ struct mptcp_tcp_sock *mptcp; ++ ++ mptcp_for_each_sub(tcp_sk(sk2)->mpcb, mptcp) { ++ sock_rps_record_flow(mptcp_to_sock(mptcp)); ++ } ++ ++ if (tcp_sk(sk2)->mpcb->master_sk) { ++ struct sock *sk_it = tcp_sk(sk2)->mpcb->master_sk; ++ ++ write_lock_bh(&sk_it->sk_callback_lock); ++ rcu_assign_pointer(sk_it->sk_wq, &newsock->wq); ++ sk_it->sk_socket = newsock; ++ write_unlock_bh(&sk_it->sk_callback_lock); ++ } ++ } ++ + WARN_ON(!((1 << sk2->sk_state) & + (TCPF_ESTABLISHED | TCPF_SYN_RECV | + TCPF_CLOSE_WAIT | TCPF_CLOSE))); +@@ -1974,6 +1997,9 @@ static int __init inet_init(void) + + ip_init(); + ++ /* We must initialize MPTCP before TCP. */ ++ mptcp_init(); ++ + /* Setup TCP slab cache for open requests. */ + tcp_init(); + +diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c +index 85a88425edc4..f3de2d6eb1a4 100644 +--- a/net/ipv4/inet_connection_sock.c ++++ b/net/ipv4/inet_connection_sock.c +@@ -19,6 +19,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -730,7 +731,10 @@ static void reqsk_timer_handler(struct timer_list *t) + int max_retries, thresh; + u8 defer_accept; + +- if (inet_sk_state_load(sk_listener) != TCP_LISTEN) ++ if (!is_meta_sk(sk_listener) && inet_sk_state_load(sk_listener) != TCP_LISTEN) ++ goto drop; ++ ++ if (is_meta_sk(sk_listener) && !mptcp_can_new_subflow(sk_listener)) + goto drop; + + max_retries = icsk->icsk_syn_retries ? 
: net->ipv4.sysctl_tcp_synack_retries; +@@ -819,7 +823,9 @@ struct sock *inet_csk_clone_lock(const struct sock *sk, + const struct request_sock *req, + const gfp_t priority) + { +- struct sock *newsk = sk_clone_lock(sk, priority); ++ struct sock *newsk; ++ ++ newsk = sk_clone_lock(sk, priority); + + if (newsk) { + struct inet_connection_sock *newicsk = inet_csk(newsk); +@@ -1019,7 +1025,14 @@ void inet_csk_listen_stop(struct sock *sk) + */ + while ((req = reqsk_queue_remove(queue, sk)) != NULL) { + struct sock *child = req->sk; ++ bool mutex_taken = false; ++ struct mptcp_cb *mpcb = tcp_sk(child)->mpcb; + ++ if (is_meta_sk(child)) { ++ WARN_ON(refcount_inc_not_zero(&mpcb->mpcb_refcnt) == 0); ++ mutex_lock(&mpcb->mpcb_mutex); ++ mutex_taken = true; ++ } + local_bh_disable(); + bh_lock_sock(child); + WARN_ON(sock_owned_by_user(child)); +@@ -1029,6 +1042,10 @@ void inet_csk_listen_stop(struct sock *sk) + reqsk_put(req); + bh_unlock_sock(child); + local_bh_enable(); ++ if (mutex_taken) { ++ mutex_unlock(&mpcb->mpcb_mutex); ++ mptcp_mpcb_put(mpcb); ++ } + sock_put(child); + + cond_resched(); +diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c +index aa3fd61818c4..8b3e955ec165 100644 +--- a/net/ipv4/ip_sockglue.c ++++ b/net/ipv4/ip_sockglue.c +@@ -44,6 +44,8 @@ + #endif + #include + ++#include ++ + #include + #include + +@@ -657,7 +659,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, + break; + old = rcu_dereference_protected(inet->inet_opt, + lockdep_sock_is_held(sk)); +- if (inet->is_icsk) { ++ if (inet->is_icsk && !is_meta_sk(sk)) { + struct inet_connection_sock *icsk = inet_csk(sk); + #if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == PF_INET || +@@ -751,6 +753,20 @@ static int do_ip_setsockopt(struct sock *sk, int level, + inet->tos = val; + sk->sk_priority = rt_tos2priority(val); + sk_dst_reset(sk); ++ /* Update TOS on mptcp subflow */ ++ if (is_meta_sk(sk)) { ++ struct mptcp_tcp_sock *mptcp; ++ ++ mptcp_for_each_sub(tcp_sk(sk)->mpcb, mptcp) { ++ struct sock *sk_it = mptcp_to_sock(mptcp); ++ ++ if (inet_sk(sk_it)->tos != inet_sk(sk)->tos) { ++ inet_sk(sk_it)->tos = inet_sk(sk)->tos; ++ sk_it->sk_priority = sk->sk_priority; ++ sk_dst_reset(sk_it); ++ } ++ } ++ } + } + break; + case IP_TTL: +diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c +index 2b45d1455592..f988be944eda 100644 +--- a/net/ipv4/syncookies.c ++++ b/net/ipv4/syncookies.c +@@ -12,6 +12,8 @@ + #include + #include + #include ++#include ++#include + #include + #include + #include +@@ -175,7 +177,8 @@ u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th, + } + EXPORT_SYMBOL_GPL(__cookie_v4_init_sequence); + +-__u32 cookie_v4_init_sequence(const struct sk_buff *skb, __u16 *mssp) ++__u32 cookie_v4_init_sequence(struct request_sock *req, const struct sock *sk, ++ const struct sk_buff *skb, __u16 *mssp) + { + const struct iphdr *iph = ip_hdr(skb); + const struct tcphdr *th = tcp_hdr(skb); +@@ -200,14 +203,33 @@ EXPORT_SYMBOL_GPL(__cookie_v4_check); + + struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, ++ const struct mptcp_options_received *mopt, + struct dst_entry *dst, u32 tsoff) + { + struct inet_connection_sock *icsk = inet_csk(sk); + struct sock *child; + bool own_req; ++#ifdef CONFIG_MPTCP ++ int ret; ++#endif + + child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst, + NULL, &own_req); ++ ++#ifdef CONFIG_MPTCP ++ if (!child) ++ goto listen_overflow; ++ ++ ret = mptcp_check_req_master(sk, child, req, skb, mopt, 0, 
tsoff); ++ if (ret < 0) ++ return NULL; ++ ++ if (!ret) ++ return tcp_sk(child)->mpcb->master_sk; ++ ++listen_overflow: ++#endif ++ + if (child) { + refcount_set(&req->rsk_refcnt, 1); + tcp_sk(child)->tsoffset = tsoff; +@@ -284,6 +306,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) + { + struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt; + struct tcp_options_received tcp_opt; ++ struct mptcp_options_received mopt; + struct inet_request_sock *ireq; + struct tcp_request_sock *treq; + struct tcp_sock *tp = tcp_sk(sk); +@@ -313,7 +336,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) + + /* check for timestamp cookie support */ + memset(&tcp_opt, 0, sizeof(tcp_opt)); +- tcp_parse_options(sock_net(sk), skb, &tcp_opt, 0, NULL); ++ mptcp_init_mp_opt(&mopt); ++ tcp_parse_options(sock_net(sk), skb, &tcp_opt, &mopt, 0, NULL, NULL); + + if (tcp_opt.saw_tstamp && tcp_opt.rcv_tsecr) { + tsoff = secure_tcp_ts_off(sock_net(sk), +@@ -326,7 +350,12 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) + goto out; + + ret = NULL; +- req = inet_reqsk_alloc(&tcp_request_sock_ops, sk, false); /* for safety */ ++#ifdef CONFIG_MPTCP ++ if (mopt.saw_mpc) ++ req = inet_reqsk_alloc(&mptcp_request_sock_ops, sk, false); /* for safety */ ++ else ++#endif ++ req = inet_reqsk_alloc(&tcp_request_sock_ops, sk, false); /* for safety */ + if (!req) + goto out; + +@@ -346,6 +375,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) + ireq->sack_ok = tcp_opt.sack_ok; + ireq->wscale_ok = tcp_opt.wscale_ok; + ireq->tstamp_ok = tcp_opt.saw_tstamp; ++ ireq->mptcp_rqsk = 0; ++ ireq->saw_mpc = 0; + req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0; + treq->snt_synack = 0; + treq->tfo_listener = false; +@@ -354,6 +385,9 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) + + ireq->ir_iif = inet_request_bound_dev_if(sk, skb); + ++ if (mopt.saw_mpc) ++ mptcp_cookies_reqsk_init(req, &mopt, skb); ++ + /* We throwed the options of the initial SYN away, so we hope + * the ACK carries the same options again (see RFC1122 4.2.3.8) + */ +@@ -392,15 +426,15 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) + (req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0)) + req->rsk_window_clamp = full_space; + +- tcp_select_initial_window(sk, full_space, req->mss, +- &req->rsk_rcv_wnd, &req->rsk_window_clamp, +- ireq->wscale_ok, &rcv_wscale, +- dst_metric(&rt->dst, RTAX_INITRWND)); ++ tp->ops->select_initial_window(sk, full_space, req->mss, ++ &req->rsk_rcv_wnd, &req->rsk_window_clamp, ++ ireq->wscale_ok, &rcv_wscale, ++ dst_metric(&rt->dst, RTAX_INITRWND)); + + ireq->rcv_wscale = rcv_wscale; + ireq->ecn_ok = cookie_ecn_ok(&tcp_opt, sock_net(sk), &rt->dst); + +- ret = tcp_get_cookie_sock(sk, skb, req, &rt->dst, tsoff); ++ ret = tcp_get_cookie_sock(sk, skb, req, &mopt, &rt->dst, tsoff); + /* ip_queue_xmit() depends on our flow being setup + * Normal sockets get it right from inet_csk_route_child_sock() + */ +diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c +index 5c8d0fb49825..19fbc2cebb07 100644 +--- a/net/ipv4/tcp.c ++++ b/net/ipv4/tcp.c +@@ -270,6 +270,7 @@ + + #include + #include ++#include + #include + #include + #include +@@ -400,6 +401,23 @@ static u64 tcp_compute_delivery_rate(const struct tcp_sock *tp) + return rate64; + } + ++const struct tcp_sock_ops tcp_specific = { ++ .__select_window = __tcp_select_window, ++ .select_window = tcp_select_window, ++ .select_initial_window = tcp_select_initial_window, 
++ .init_buffer_space = tcp_init_buffer_space, ++ .set_rto = tcp_set_rto, ++ .should_expand_sndbuf = tcp_should_expand_sndbuf, ++ .send_fin = tcp_send_fin, ++ .write_xmit = tcp_write_xmit, ++ .send_active_reset = tcp_send_active_reset, ++ .write_wakeup = tcp_write_wakeup, ++ .retransmit_timer = tcp_retransmit_timer, ++ .time_wait = tcp_time_wait, ++ .cleanup_rbuf = tcp_cleanup_rbuf, ++ .set_cong_ctrl = __tcp_set_congestion_control, ++}; ++ + /* Address-family independent initialization for a tcp_sock. + * + * NOTE: A lot of things set to zero explicitly by call to +@@ -453,6 +471,11 @@ void tcp_init_sock(struct sock *sk) + WRITE_ONCE(sk->sk_sndbuf, sock_net(sk)->ipv4.sysctl_tcp_wmem[1]); + WRITE_ONCE(sk->sk_rcvbuf, sock_net(sk)->ipv4.sysctl_tcp_rmem[1]); + ++ tp->ops = &tcp_specific; ++ ++ /* Initialize MPTCP-specific stuff and function-pointers */ ++ mptcp_init_tcp_sock(sk); ++ + sk_sockets_allocated_inc(sk); + sk->sk_route_forced_caps = NETIF_F_GSO; + } +@@ -484,7 +507,7 @@ static inline bool tcp_stream_is_readable(const struct tcp_sock *tp, + return true; + if (tcp_rmem_pressure(sk)) + return true; +- if (tcp_receive_window(tp) <= inet_csk(sk)->icsk_ack.rcv_mss) ++ if (tcp_receive_window_now(tp) <= inet_csk(sk)->icsk_ack.rcv_mss) + return true; + } + if (sk->sk_prot->stream_memory_read) +@@ -787,6 +810,7 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos, + int ret; + + sock_rps_record_flow(sk); ++ + /* + * We can't seek on a socket input + */ +@@ -797,6 +821,16 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos, + + lock_sock(sk); + ++#ifdef CONFIG_MPTCP ++ if (mptcp(tcp_sk(sk))) { ++ struct mptcp_tcp_sock *mptcp; ++ ++ mptcp_for_each_sub(tcp_sk(sk)->mpcb, mptcp) { ++ sock_rps_record_flow(mptcp_to_sock(mptcp)); ++ } ++ } ++#endif ++ + timeo = sock_rcvtimeo(sk, sock->file->f_flags & O_NONBLOCK); + while (tss.len) { + ret = __tcp_splice_read(sk, &tss); +@@ -912,8 +946,7 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp, + return NULL; + } + +-static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, +- int large_allowed) ++unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, int large_allowed) + { + struct tcp_sock *tp = tcp_sk(sk); + u32 new_size_goal, size_goal; +@@ -941,8 +974,13 @@ static int tcp_send_mss(struct sock *sk, int *size_goal, int flags) + { + int mss_now; + +- mss_now = tcp_current_mss(sk); +- *size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB)); ++ if (mptcp(tcp_sk(sk))) { ++ mss_now = mptcp_current_mss(sk); ++ *size_goal = mptcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB)); ++ } else { ++ mss_now = tcp_current_mss(sk); ++ *size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB)); ++ } + + return mss_now; + } +@@ -982,12 +1020,34 @@ ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset, + * is fully established. + */ + if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) && +- !tcp_passive_fastopen(sk)) { ++ !tcp_passive_fastopen(mptcp(tp) && tp->mpcb->master_sk ? ++ tp->mpcb->master_sk : sk)) { + err = sk_stream_wait_connect(sk, &timeo); + if (err != 0) + goto out_err; + } + ++ if (mptcp(tp)) { ++ struct mptcp_tcp_sock *mptcp; ++ ++ /* We must check this with socket-lock hold because we iterate ++ * over the subflows. 
++ */ ++ if (!mptcp_can_sendpage(sk)) { ++ ssize_t ret; ++ ++ release_sock(sk); ++ ret = sock_no_sendpage(sk->sk_socket, page, offset, ++ size, flags); ++ lock_sock(sk); ++ return ret; ++ } ++ ++ mptcp_for_each_sub(tp->mpcb, mptcp) { ++ sock_rps_record_flow(mptcp_to_sock(mptcp)); ++ } ++ } ++ + sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); + + mss_now = tcp_send_mss(sk, &size_goal, flags); +@@ -1109,7 +1169,8 @@ EXPORT_SYMBOL_GPL(do_tcp_sendpages); + int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset, + size_t size, int flags) + { +- if (!(sk->sk_route_caps & NETIF_F_SG)) ++ /* If MPTCP is enabled, we check it later after establishment */ ++ if (!mptcp(tcp_sk(sk)) && !(sk->sk_route_caps & NETIF_F_SG)) + return sock_no_sendpage_locked(sk, page, offset, size, flags); + + tcp_rate_check_app_limited(sk); /* is sending application-limited? */ +@@ -1231,12 +1292,21 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) + * is fully established. + */ + if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) && +- !tcp_passive_fastopen(sk)) { ++ !tcp_passive_fastopen(mptcp(tp) && tp->mpcb->master_sk ? ++ tp->mpcb->master_sk : sk)) { + err = sk_stream_wait_connect(sk, &timeo); + if (err != 0) + goto do_error; + } + ++ if (mptcp(tp)) { ++ struct mptcp_tcp_sock *mptcp; ++ ++ mptcp_for_each_sub(tp->mpcb, mptcp) { ++ sock_rps_record_flow(mptcp_to_sock(mptcp)); ++ } ++ } ++ + if (unlikely(tp->repair)) { + if (tp->repair_queue == TCP_RECV_QUEUE) { + copied = tcp_send_rcvq(sk, msg, size); +@@ -1529,7 +1599,7 @@ static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len) + * calculation of whether or not we must ACK for the sake of + * a window update. + */ +-static void tcp_cleanup_rbuf(struct sock *sk, int copied) ++void tcp_cleanup_rbuf(struct sock *sk, int copied) + { + struct tcp_sock *tp = tcp_sk(sk); + bool time_to_ack = false; +@@ -1568,11 +1638,11 @@ static void tcp_cleanup_rbuf(struct sock *sk, int copied) + * in states, where we will not receive more. It is useless. + */ + if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) { +- __u32 rcv_window_now = tcp_receive_window(tp); ++ __u32 rcv_window_now = tcp_receive_window_now(tp); + + /* Optimize, __tcp_select_window() is not cheap. */ + if (2*rcv_window_now <= tp->window_clamp) { +- __u32 new_window = __tcp_select_window(sk); ++ __u32 new_window = tp->ops->__select_window(sk); + + /* Send ACK now, if this read freed lots of space + * in our buffer. Certainly, new_window is new window. +@@ -1688,7 +1758,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, + /* Clean up data we have read: This will do ACK frames. */ + if (copied > 0) { + tcp_recv_skb(sk, seq, &offset); +- tcp_cleanup_rbuf(sk, copied); ++ tp->ops->cleanup_rbuf(sk, copied); + } + return copied; + } +@@ -1979,6 +2049,16 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, + + lock_sock(sk); + ++#ifdef CONFIG_MPTCP ++ if (mptcp(tp)) { ++ struct mptcp_tcp_sock *mptcp; ++ ++ mptcp_for_each_sub(tp->mpcb, mptcp) { ++ sock_rps_record_flow(mptcp_to_sock(mptcp)); ++ } ++ } ++#endif ++ + err = -ENOTCONN; + if (sk->sk_state == TCP_LISTEN) + goto out; +@@ -2097,7 +2177,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, + } + } + +- tcp_cleanup_rbuf(sk, copied); ++ tp->ops->cleanup_rbuf(sk, copied); + + if (copied >= target) { + /* Do not sleep, just process backlog. 
*/ +@@ -2189,7 +2269,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, + */ + + /* Clean up data we have read: This will do ACK frames. */ +- tcp_cleanup_rbuf(sk, copied); ++ tp->ops->cleanup_rbuf(sk, copied); + + release_sock(sk); + +@@ -2248,8 +2328,11 @@ void tcp_set_state(struct sock *sk, int state) + + switch (state) { + case TCP_ESTABLISHED: +- if (oldstate != TCP_ESTABLISHED) ++ if (oldstate != TCP_ESTABLISHED) { + TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB); ++ if (is_meta_sk(sk)) ++ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_CURRESTAB); ++ } + break; + + case TCP_CLOSE: +@@ -2262,8 +2345,11 @@ void tcp_set_state(struct sock *sk, int state) + inet_put_port(sk); + /* fall through */ + default: +- if (oldstate == TCP_ESTABLISHED) ++ if (oldstate == TCP_ESTABLISHED) { + TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB); ++ if (is_meta_sk(sk)) ++ MPTCP_DEC_STATS(sock_net(sk), MPTCP_MIB_CURRESTAB); ++ } + } + + /* Change state AFTER socket is unhashed to avoid closed +@@ -2297,7 +2383,7 @@ static const unsigned char new_state[16] = { + [TCP_NEW_SYN_RECV] = TCP_CLOSE, /* should not happen ! */ + }; + +-static int tcp_close_state(struct sock *sk) ++int tcp_close_state(struct sock *sk) + { + int next = (int)new_state[sk->sk_state]; + int ns = next & TCP_STATE_MASK; +@@ -2327,7 +2413,7 @@ void tcp_shutdown(struct sock *sk, int how) + TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) { + /* Clear out any half completed packets. FIN if needed. */ + if (tcp_close_state(sk)) +- tcp_send_fin(sk); ++ tcp_sk(sk)->ops->send_fin(sk); + } + } + EXPORT_SYMBOL(tcp_shutdown); +@@ -2352,6 +2438,17 @@ void tcp_close(struct sock *sk, long timeout) + int data_was_unread = 0; + int state; + ++ if (is_meta_sk(sk)) { ++ /* TODO: Currently forcing timeout to 0 because ++ * sk_stream_wait_close will complain during lockdep because ++ * of the mpcb_mutex (circular lock dependency through ++ * inet_csk_listen_stop()). ++ * We should find a way to get rid of the mpcb_mutex. ++ */ ++ mptcp_close(sk, 0); ++ return; ++ } ++ + lock_sock(sk); + sk->sk_shutdown = SHUTDOWN_MASK; + +@@ -2396,7 +2493,7 @@ void tcp_close(struct sock *sk, long timeout) + /* Unread data was tossed, zap the connection. */ + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE); + tcp_set_state(sk, TCP_CLOSE); +- tcp_send_active_reset(sk, sk->sk_allocation); ++ tcp_sk(sk)->ops->send_active_reset(sk, sk->sk_allocation); + } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) { + /* Check zero linger _after_ checking for unread data. 
*/ + sk->sk_prot->disconnect(sk, 0); +@@ -2470,7 +2567,7 @@ void tcp_close(struct sock *sk, long timeout) + struct tcp_sock *tp = tcp_sk(sk); + if (tp->linger2 < 0) { + tcp_set_state(sk, TCP_CLOSE); +- tcp_send_active_reset(sk, GFP_ATOMIC); ++ tp->ops->send_active_reset(sk, GFP_ATOMIC); + __NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPABORTONLINGER); + } else { +@@ -2480,7 +2577,8 @@ void tcp_close(struct sock *sk, long timeout) + inet_csk_reset_keepalive_timer(sk, + tmo - TCP_TIMEWAIT_LEN); + } else { +- tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); ++ tcp_sk(sk)->ops->time_wait(sk, TCP_FIN_WAIT2, ++ tmo); + goto out; + } + } +@@ -2489,7 +2587,7 @@ void tcp_close(struct sock *sk, long timeout) + sk_mem_reclaim(sk); + if (tcp_check_oom(sk, 0)) { + tcp_set_state(sk, TCP_CLOSE); +- tcp_send_active_reset(sk, GFP_ATOMIC); ++ tcp_sk(sk)->ops->send_active_reset(sk, GFP_ATOMIC); + __NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPABORTONMEMORY); + } else if (!check_net(sock_net(sk))) { +@@ -2521,15 +2619,6 @@ void tcp_close(struct sock *sk, long timeout) + } + EXPORT_SYMBOL(tcp_close); + +-/* These states need RST on ABORT according to RFC793 */ +- +-static inline bool tcp_need_reset(int state) +-{ +- return (1 << state) & +- (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 | +- TCPF_FIN_WAIT2 | TCPF_SYN_RECV); +-} +- + static void tcp_rtx_queue_purge(struct sock *sk) + { + struct rb_node *p = rb_first(&sk->tcp_rtx_queue); +@@ -2551,6 +2640,10 @@ void tcp_write_queue_purge(struct sock *sk) + { + struct sk_buff *skb; + ++ if (mptcp(tcp_sk(sk)) && !is_meta_sk(sk) && ++ !tcp_rtx_and_write_queues_empty(sk)) ++ mptcp_reinject_data(sk, 0); ++ + tcp_chrono_stop(sk, TCP_CHRONO_BUSY); + while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) { + tcp_skb_tsorted_anchor_cleanup(skb); +@@ -2569,6 +2662,36 @@ void tcp_write_queue_purge(struct sock *sk) + inet_csk(sk)->icsk_backoff = 0; + } + ++void tcp_reset_vars(struct sock *sk) ++{ ++ struct inet_connection_sock *icsk = inet_csk(sk); ++ struct tcp_sock *tp = tcp_sk(sk); ++ ++ tp->srtt_us = 0; ++ tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); ++ tp->rcv_rtt_last_tsecr = 0; ++ icsk->icsk_probes_tstamp = 0; ++ icsk->icsk_rto = TCP_TIMEOUT_INIT; ++ tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; ++ tp->snd_cwnd = TCP_INIT_CWND; ++ tp->snd_cwnd_cnt = 0; ++ tp->delivered = 0; ++ tp->delivered_ce = 0; ++ tp->is_sack_reneg = 0; ++ tcp_clear_retrans(tp); ++ tp->segs_in = 0; ++ tp->segs_out = 0; ++ tp->bytes_sent = 0; ++ tp->bytes_acked = 0; ++ tp->bytes_received = 0; ++ tp->bytes_retrans = 0; ++ tp->total_retrans = 0; ++ tp->data_segs_in = 0; ++ tp->data_segs_out = 0; ++ /* There's a bubble in the pipe until at least the first ACK. */ ++ tp->app_limited = ~0U; ++} ++ + int tcp_disconnect(struct sock *sk, int flags) + { + struct inet_sock *inet = inet_sk(sk); +@@ -2591,7 +2714,7 @@ int tcp_disconnect(struct sock *sk, int flags) + /* The last check adjusts for discrepancy of Linux wrt. 
RFC + * states + */ +- tcp_send_active_reset(sk, gfp_any()); ++ tp->ops->send_active_reset(sk, gfp_any()); + sk->sk_err = ECONNRESET; + } else if (old_state == TCP_SYN_SENT) + sk->sk_err = ECONNRESET; +@@ -2613,11 +2736,15 @@ int tcp_disconnect(struct sock *sk, int flags) + if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) + inet_reset_saddr(sk); + ++ if (is_meta_sk(sk)) { ++ mptcp_disconnect(sk); ++ } else { ++ if (tp->inside_tk_table) ++ mptcp_hash_remove_bh(tp); ++ } ++ + sk->sk_shutdown = 0; + sock_reset_flag(sk, SOCK_DONE); +- tp->srtt_us = 0; +- tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); +- tp->rcv_rtt_last_tsecr = 0; + + seq = tp->write_seq + tp->max_window + 2; + if (!seq) +@@ -2627,21 +2754,14 @@ int tcp_disconnect(struct sock *sk, int flags) + icsk->icsk_backoff = 0; + tp->snd_cwnd = 2; + icsk->icsk_probes_out = 0; +- icsk->icsk_probes_tstamp = 0; +- icsk->icsk_rto = TCP_TIMEOUT_INIT; +- tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; +- tp->snd_cwnd = TCP_INIT_CWND; +- tp->snd_cwnd_cnt = 0; + tp->window_clamp = 0; +- tp->delivered = 0; +- tp->delivered_ce = 0; ++ ++ tcp_reset_vars(sk); ++ + if (icsk->icsk_ca_ops->release) + icsk->icsk_ca_ops->release(sk); + memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); + tcp_set_ca_state(sk, TCP_CA_Open); +- tp->is_sack_reneg = 0; +- tcp_clear_retrans(tp); +- tp->total_retrans = 0; + inet_csk_delack_init(sk); + /* Initialize rcv_mss to TCP_MIN_MSS to avoid division by 0 + * issue in __tcp_select_window() +@@ -2653,14 +2773,6 @@ int tcp_disconnect(struct sock *sk, int flags) + sk->sk_rx_dst = NULL; + tcp_saved_syn_free(tp); + tp->compressed_ack = 0; +- tp->segs_in = 0; +- tp->segs_out = 0; +- tp->bytes_sent = 0; +- tp->bytes_acked = 0; +- tp->bytes_received = 0; +- tp->bytes_retrans = 0; +- tp->data_segs_in = 0; +- tp->data_segs_out = 0; + tp->duplicate_sack[0].start_seq = 0; + tp->duplicate_sack[0].end_seq = 0; + tp->dsack_dups = 0; +@@ -2669,8 +2781,6 @@ int tcp_disconnect(struct sock *sk, int flags) + tp->sacked_out = 0; + tp->tlp_high_seq = 0; + tp->last_oow_ack_time = 0; +- /* There's a bubble in the pipe until at least the first ACK. */ +- tp->app_limited = ~0U; + tp->rack.mstamp = 0; + tp->rack.advanced = 0; + tp->rack.reo_wnd_steps = 1; +@@ -2704,7 +2814,7 @@ EXPORT_SYMBOL(tcp_disconnect); + static inline bool tcp_can_repair_sock(const struct sock *sk) + { + return ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) && +- (sk->sk_state != TCP_LISTEN); ++ (sk->sk_state != TCP_LISTEN) && !sock_flag(sk, SOCK_MPTCP); + } + + static int tcp_repair_set_window(struct tcp_sock *tp, char __user *optbuf, int len) +@@ -2735,6 +2845,7 @@ static int tcp_repair_set_window(struct tcp_sock *tp, char __user *optbuf, int l + + tp->rcv_wnd = opt.rcv_wnd; + tp->rcv_wup = opt.rcv_wup; ++ tp->rcv_right_edge = tp->rcv_wup + tp->rcv_wnd; + + return 0; + } +@@ -2873,6 +2984,61 @@ static int do_tcp_setsockopt(struct sock *sk, int level, + + return tcp_fastopen_reset_cipher(net, sk, key, backup_key); + } ++#ifdef CONFIG_MPTCP ++ case MPTCP_SCHEDULER: { ++ char name[MPTCP_SCHED_NAME_MAX]; ++ ++ if (optlen < 1) ++ return -EINVAL; ++ ++ /* Cannot be used if MPTCP is not used or we already have ++ * established an MPTCP-connection. 
++ */ ++ if (mptcp_init_failed || !sysctl_mptcp_enabled || ++ sk->sk_state != TCP_CLOSE) ++ return -EPERM; ++ ++ val = strncpy_from_user(name, optval, ++ min_t(long, MPTCP_SCHED_NAME_MAX - 1, ++ optlen)); ++ ++ if (val < 0) ++ return -EFAULT; ++ name[val] = 0; ++ ++ lock_sock(sk); ++ err = mptcp_set_scheduler(sk, name); ++ release_sock(sk); ++ return err; ++ } ++ ++ case MPTCP_PATH_MANAGER: { ++ char name[MPTCP_PM_NAME_MAX]; ++ ++ if (optlen < 1) ++ return -EINVAL; ++ ++ /* Cannot be used if MPTCP is not used or we already have ++ * established an MPTCP-connection. ++ */ ++ if (mptcp_init_failed || !sysctl_mptcp_enabled || ++ sk->sk_state != TCP_CLOSE) ++ return -EPERM; ++ ++ val = strncpy_from_user(name, optval, ++ min_t(long, MPTCP_PM_NAME_MAX - 1, ++ optlen)); ++ ++ if (val < 0) ++ return -EFAULT; ++ name[val] = 0; ++ ++ lock_sock(sk); ++ err = mptcp_set_path_manager(sk, name); ++ release_sock(sk); ++ return err; ++ } ++#endif + default: + /* fallthru */ + break; +@@ -3062,6 +3228,12 @@ static int do_tcp_setsockopt(struct sock *sk, int level, + break; + + case TCP_DEFER_ACCEPT: ++ /* An established MPTCP-connection (mptcp(tp) only returns true ++ * if the socket is established) should not use DEFER on new ++ * subflows. ++ */ ++ if (mptcp(tp)) ++ break; + /* Translate value in seconds to number of retransmits */ + icsk->icsk_accept_queue.rskq_defer_accept = + secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ, +@@ -3089,7 +3261,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, + (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) && + inet_csk_ack_scheduled(sk)) { + icsk->icsk_ack.pending |= ICSK_ACK_PUSHED; +- tcp_cleanup_rbuf(sk, 1); ++ tp->ops->cleanup_rbuf(sk, 1); + if (!(val & 1)) + inet_csk_enter_pingpong_mode(sk); + } +@@ -3099,7 +3271,10 @@ static int do_tcp_setsockopt(struct sock *sk, int level, + #ifdef CONFIG_TCP_MD5SIG + case TCP_MD5SIG: + case TCP_MD5SIG_EXT: +- err = tp->af_specific->md5_parse(sk, optname, optval, optlen); ++ if (!sock_flag(sk, SOCK_MPTCP)) ++ err = tp->af_specific->md5_parse(sk, optname, optval, optlen); ++ else ++ err = -EINVAL; + break; + #endif + case TCP_USER_TIMEOUT: +@@ -3155,6 +3330,32 @@ static int do_tcp_setsockopt(struct sock *sk, int level, + tp->notsent_lowat = val; + sk->sk_write_space(sk); + break; ++#ifdef CONFIG_MPTCP ++ case MPTCP_ENABLED: ++ if (mptcp_init_failed || !sysctl_mptcp_enabled || ++ sk->sk_state != TCP_CLOSE ++#ifdef CONFIG_TCP_MD5SIG ++ || rcu_access_pointer(tp->md5sig_info) ++#endif ++ ) { ++ err = -EPERM; ++ break; ++ } ++ ++ if (val) ++ mptcp_enable_sock(sk); ++ else ++ mptcp_disable_sock(sk); ++ break; ++ case MPTCP_INFO: ++ if (mptcp_init_failed || !sysctl_mptcp_enabled) { ++ err = -EPERM; ++ break; ++ } ++ ++ tp->record_master_info = !!(val & MPTCP_INFO_FLAG_SAVE_MASTER); ++ break; ++#endif + case TCP_INQ: + if (val > 1 || val < 0) + err = -EINVAL; +@@ -3219,7 +3420,7 @@ static void tcp_get_info_chrono_stats(const struct tcp_sock *tp, + } + + /* Return information about state of tcp endpoint in API format. 
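/*
 * Userspace view of the setsockopt() cases added above (illustrative only,
 * not part of the diff itself). It assumes the MPTCP_ENABLED /
 * MPTCP_SCHEDULER / MPTCP_PATH_MANAGER constants from this patch set are
 * visible to userspace (their numeric values are defined in headers outside
 * this hunk), and the scheduler / path-manager names below are merely
 * examples that must match algorithms available in the running kernel. All
 * three options are only accepted while the socket is still in TCP_CLOSE
 * and MPTCP is enabled system-wide; otherwise they fail with EPERM.
 */
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

static int enable_mptcp(int fd)
{
	int one = 1;

	if (setsockopt(fd, IPPROTO_TCP, MPTCP_ENABLED, &one, sizeof(one)) < 0)
		return -1;
	if (setsockopt(fd, IPPROTO_TCP, MPTCP_SCHEDULER,
		       "default", strlen("default") + 1) < 0)
		return -1;
	return setsockopt(fd, IPPROTO_TCP, MPTCP_PATH_MANAGER,
			  "fullmesh", strlen("fullmesh") + 1);
}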
*/ +-void tcp_get_info(struct sock *sk, struct tcp_info *info) ++void tcp_get_info(struct sock *sk, struct tcp_info *info, bool no_lock) + { + const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */ + const struct inet_connection_sock *icsk = inet_csk(sk); +@@ -3256,7 +3457,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) + return; + } + +- slow = lock_sock_fast(sk); ++ if (!no_lock) ++ slow = lock_sock_fast(sk); + + info->tcpi_ca_state = icsk->icsk_ca_state; + info->tcpi_retransmits = icsk->icsk_retransmits; +@@ -3332,7 +3534,9 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) + info->tcpi_reord_seen = tp->reord_seen; + info->tcpi_rcv_ooopack = tp->rcv_ooopack; + info->tcpi_snd_wnd = tp->snd_wnd; +- unlock_sock_fast(sk, slow); ++ ++ if (!no_lock) ++ unlock_sock_fast(sk, slow); + } + EXPORT_SYMBOL_GPL(tcp_get_info); + +@@ -3479,7 +3683,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level, + if (get_user(len, optlen)) + return -EFAULT; + +- tcp_get_info(sk, &info); ++ tcp_get_info(sk, &info, false); + + len = min_t(unsigned int, len, sizeof(info)); + if (put_user(len, optlen)) +@@ -3668,6 +3872,87 @@ static int do_tcp_getsockopt(struct sock *sk, int level, + } + return 0; + } ++#ifdef CONFIG_MPTCP ++ case MPTCP_SCHEDULER: ++ if (get_user(len, optlen)) ++ return -EFAULT; ++ len = min_t(unsigned int, len, MPTCP_SCHED_NAME_MAX); ++ if (put_user(len, optlen)) ++ return -EFAULT; ++ ++ lock_sock(sk); ++ if (mptcp(tcp_sk(sk))) { ++ struct mptcp_cb *mpcb = tcp_sk(mptcp_meta_sk(sk))->mpcb; ++ ++ if (copy_to_user(optval, mpcb->sched_ops->name, len)) { ++ release_sock(sk); ++ return -EFAULT; ++ } ++ } else { ++ if (copy_to_user(optval, tcp_sk(sk)->mptcp_sched_name, ++ len)) { ++ release_sock(sk); ++ return -EFAULT; ++ } ++ } ++ release_sock(sk); ++ return 0; ++ ++ case MPTCP_PATH_MANAGER: ++ if (get_user(len, optlen)) ++ return -EFAULT; ++ len = min_t(unsigned int, len, MPTCP_PM_NAME_MAX); ++ if (put_user(len, optlen)) ++ return -EFAULT; ++ ++ lock_sock(sk); ++ if (mptcp(tcp_sk(sk))) { ++ struct mptcp_cb *mpcb = tcp_sk(mptcp_meta_sk(sk))->mpcb; ++ ++ if (copy_to_user(optval, mpcb->pm_ops->name, len)) { ++ release_sock(sk); ++ return -EFAULT; ++ } ++ } else { ++ if (copy_to_user(optval, tcp_sk(sk)->mptcp_pm_name, ++ len)) { ++ release_sock(sk); ++ return -EFAULT; ++ } ++ } ++ release_sock(sk); ++ return 0; ++ ++ case MPTCP_ENABLED: ++ if (sk->sk_state != TCP_SYN_SENT) ++ val = mptcp(tp) ? 1 : 0; ++ else ++ val = sock_flag(sk, SOCK_MPTCP) ? 1 : 0; ++ break; ++ case MPTCP_INFO: ++ { ++ int ret; ++ ++ if (!mptcp(tp)) ++ return -EINVAL; ++ ++ if (get_user(len, optlen)) ++ return -EFAULT; ++ ++ len = min_t(unsigned int, len, sizeof(struct mptcp_info)); ++ ++ lock_sock(sk); ++ ret = mptcp_get_info(sk, optval, len); ++ release_sock(sk); ++ ++ if (ret) ++ return ret; ++ ++ if (put_user(len, optlen)) ++ return -EFAULT; ++ return 0; ++ } ++#endif + #ifdef CONFIG_MMU + case TCP_ZEROCOPY_RECEIVE: { + struct tcp_zerocopy_receive zc; +@@ -3873,7 +4158,9 @@ void tcp_done(struct sock *sk) + if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV) + TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS); + ++ WARN_ON(sk->sk_state == TCP_CLOSE); + tcp_set_state(sk, TCP_CLOSE); ++ + tcp_clear_xmit_timers(sk); + if (req) + reqsk_fastopen_remove(sk, req, false); +@@ -3889,6 +4176,8 @@ EXPORT_SYMBOL_GPL(tcp_done); + + int tcp_abort(struct sock *sk, int err) + { ++ struct sock *meta_sk = mptcp(tcp_sk(sk)) ? 
mptcp_meta_sk(sk) : sk; ++ + if (!sk_fullsock(sk)) { + if (sk->sk_state == TCP_NEW_SYN_RECV) { + struct request_sock *req = inet_reqsk(sk); +@@ -3902,7 +4191,7 @@ int tcp_abort(struct sock *sk, int err) + } + + /* Don't race with userspace socket closes such as tcp_close. */ +- lock_sock(sk); ++ lock_sock(meta_sk); + + if (sk->sk_state == TCP_LISTEN) { + tcp_set_state(sk, TCP_CLOSE); +@@ -3911,7 +4200,7 @@ int tcp_abort(struct sock *sk, int err) + + /* Don't race with BH socket closes such as inet_csk_listen_stop. */ + local_bh_disable(); +- bh_lock_sock(sk); ++ bh_lock_sock(meta_sk); + + if (!sock_flag(sk, SOCK_DEAD)) { + sk->sk_err = err; +@@ -3919,14 +4208,14 @@ int tcp_abort(struct sock *sk, int err) + smp_wmb(); + sk->sk_error_report(sk); + if (tcp_need_reset(sk->sk_state)) +- tcp_send_active_reset(sk, GFP_ATOMIC); ++ tcp_sk(sk)->ops->send_active_reset(sk, GFP_ATOMIC); + tcp_done(sk); + } + +- bh_unlock_sock(sk); ++ bh_unlock_sock(meta_sk); + local_bh_enable(); + tcp_write_queue_purge(sk); +- release_sock(sk); ++ release_sock(meta_sk); + return 0; + } + EXPORT_SYMBOL_GPL(tcp_abort); +diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c +index 6d5600889dcf..247c1168b6a5 100644 +--- a/net/ipv4/tcp_cong.c ++++ b/net/ipv4/tcp_cong.c +@@ -337,13 +337,19 @@ int tcp_set_allowed_congestion_control(char *val) + return ret; + } + ++int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, ++ bool reinit, bool cap_net_admin) ++{ ++ return tcp_sk(sk)->ops->set_cong_ctrl(sk, name, load, reinit, cap_net_admin); ++} ++ + /* Change congestion control for socket. If load is false, then it is the + * responsibility of the caller to call tcp_init_congestion_control or + * tcp_reinit_congestion_control (if the current congestion control was + * already initialized. 
+ */ +-int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, +- bool reinit, bool cap_net_admin) ++int __tcp_set_congestion_control(struct sock *sk, const char *name, bool load, ++ bool reinit, bool cap_net_admin) + { + struct inet_connection_sock *icsk = inet_csk(sk); + const struct tcp_congestion_ops *ca; +diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c +index 549506162dde..e5a530e0b1c5 100644 +--- a/net/ipv4/tcp_diag.c ++++ b/net/ipv4/tcp_diag.c +@@ -31,7 +31,7 @@ static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, + r->idiag_wqueue = READ_ONCE(tp->write_seq) - tp->snd_una; + } + if (info) +- tcp_get_info(sk, info); ++ tcp_get_info(sk, info, false); + } + + #ifdef CONFIG_TCP_MD5SIG +diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c +index a5ec77a5ad6f..f9fb4a268b9b 100644 +--- a/net/ipv4/tcp_fastopen.c ++++ b/net/ipv4/tcp_fastopen.c +@@ -9,6 +9,7 @@ + #include + #include + #include ++#include + + void tcp_fastopen_init_key_once(struct net *net) + { +@@ -136,8 +137,6 @@ static bool __tcp_fastopen_cookie_gen_cipher(struct request_sock *req, + const siphash_key_t *key, + struct tcp_fastopen_cookie *foc) + { +- BUILD_BUG_ON(TCP_FASTOPEN_COOKIE_SIZE != sizeof(u64)); +- + if (req->rsk_ops->family == AF_INET) { + const struct iphdr *iph = ip_hdr(syn); + +@@ -258,8 +257,9 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk, + { + struct tcp_sock *tp; + struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; +- struct sock *child; ++ struct sock *child, *meta_sk; + bool own_req; ++ int ret; + + child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL, + NULL, &own_req); +@@ -294,15 +294,27 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk, + + refcount_set(&req->rsk_refcnt, 2); + +- /* Now finish processing the fastopen child socket. */ +- tcp_init_transfer(child, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); +- + tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; + + tcp_fastopen_add_skb(child, skb); + + tcp_rsk(req)->rcv_nxt = tp->rcv_nxt; + tp->rcv_wup = tp->rcv_nxt; ++ tp->rcv_right_edge = tp->rcv_wup + tp->rcv_wnd; ++ ++ meta_sk = child; ++ ret = mptcp_check_req_fastopen(meta_sk, req); ++ if (ret < 0) ++ return NULL; ++ ++ if (ret == 0) { ++ child = tcp_sk(meta_sk)->mpcb->master_sk; ++ tp = tcp_sk(child); ++ } ++ ++ /* Now finish processing the fastopen child socket. */ ++ tcp_init_transfer(child, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); ++ + /* tcp_conn_request() is sending the SYNACK, + * and queues the child into listener accept queue. + */ +diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c +index c0fcfa296468..dae2ce9656b8 100644 +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -76,35 +76,15 @@ + #include + #include + #include ++#include ++#include ++#include + #include + #include + #include + + int sysctl_tcp_max_orphans __read_mostly = NR_FILE; + +-#define FLAG_DATA 0x01 /* Incoming frame contained data. */ +-#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ +-#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */ +-#define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted. */ +-#define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */ +-#define FLAG_DATA_SACKED 0x20 /* New SACK. 
*/ +-#define FLAG_ECE 0x40 /* ECE in this ACK */ +-#define FLAG_LOST_RETRANS 0x80 /* This ACK marks some retransmission lost */ +-#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/ +-#define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */ +-#define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */ +-#define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */ +-#define FLAG_SET_XMIT_TIMER 0x1000 /* Set TLP or RTO timer */ +-#define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */ +-#define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */ +-#define FLAG_NO_CHALLENGE_ACK 0x8000 /* do not call tcp_send_challenge_ack() */ +-#define FLAG_ACK_MAYBE_DELAYED 0x10000 /* Likely a delayed ACK */ +- +-#define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED) +-#define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED) +-#define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE|FLAG_DSACKING_ACK) +-#define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED) +- + #define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH) + #define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH)) + +@@ -349,8 +329,12 @@ static void tcp_sndbuf_expand(struct sock *sk) + per_mss = roundup_pow_of_two(per_mss) + + SKB_DATA_ALIGN(sizeof(struct sk_buff)); + +- nr_segs = max_t(u32, TCP_INIT_CWND, tp->snd_cwnd); +- nr_segs = max_t(u32, nr_segs, tp->reordering + 1); ++ if (mptcp(tp)) { ++ nr_segs = mptcp_check_snd_buf(tp); ++ } else { ++ nr_segs = max_t(u32, TCP_INIT_CWND, tp->snd_cwnd); ++ nr_segs = max_t(u32, nr_segs, tp->reordering + 1); ++ } + + /* Fast Recovery (RFC 5681 3.2) : + * Cubic needs 1.7 factor, rounded to 2 to include +@@ -359,9 +343,17 @@ static void tcp_sndbuf_expand(struct sock *sk) + sndmem = ca_ops->sndbuf_expand ? ca_ops->sndbuf_expand(sk) : 2; + sndmem *= nr_segs * per_mss; + +- if (sk->sk_sndbuf < sndmem) ++ /* MPTCP: after this sndmem is the new contribution of the ++ * current subflow to the aggregated sndbuf */ ++ if (sk->sk_sndbuf < sndmem) { ++ int old_sndbuf = sk->sk_sndbuf; + WRITE_ONCE(sk->sk_sndbuf, + min(sndmem, sock_net(sk)->ipv4.sysctl_tcp_wmem[2])); ++ /* MPTCP: ok, the subflow sndbuf has grown, reflect ++ * this in the aggregate buffer.*/ ++ if (mptcp(tp) && old_sndbuf != sk->sk_sndbuf) ++ mptcp_update_sndbuf(tp); ++ } + } + + /* 2. Tuning advertised window (window_clamp, rcv_ssthresh) +@@ -410,9 +402,14 @@ static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb) + static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb) + { + struct tcp_sock *tp = tcp_sk(sk); ++ struct sock *meta_sk = mptcp(tp) ? mptcp_meta_sk(sk) : sk; ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk); + int room; + +- room = min_t(int, tp->window_clamp, tcp_space(sk)) - tp->rcv_ssthresh; ++ if (is_meta_sk(sk)) ++ return; ++ ++ room = min_t(int, meta_tp->window_clamp, tcp_space(meta_sk)) - meta_tp->rcv_ssthresh; + + /* Check #1 */ + if (room > 0 && !tcp_under_memory_pressure(sk)) { +@@ -422,13 +419,13 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb) + * will fit to rcvbuf in future. 
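/*
 * The receive-buffer and window tuning above is done against the MPTCP meta
 * socket rather than the individual subflow, because the advertised window
 * and rcvbuf are shared by all subflows of one connection. A rough sketch
 * of the helpers this patch relies on throughout, inferred only from the
 * fields visible in these hunks (tp->meta_sk, tp->mpcb); the real
 * definitions live in the MPTCP headers added elsewhere in the patch set
 * and differ in detail:
 */
static inline struct sock *mptcp_meta_sk(const struct sock *sk)
{
	return tcp_sk(sk)->meta_sk;	/* subflow -> connection-level socket */
}

static inline bool mptcp(const struct tcp_sock *tp)
{
	return tp->mpcb != NULL;	/* sketch: true once MPTCP is in use */
}

static inline bool is_meta_sk(const struct sock *sk)
{
	return mptcp(tcp_sk(sk)) && mptcp_meta_sk(sk) == sk;
}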
+ */ + if (tcp_win_from_space(sk, skb->truesize) <= skb->len) +- incr = 2 * tp->advmss; ++ incr = 2 * meta_tp->advmss; + else +- incr = __tcp_grow_window(sk, skb); ++ incr = __tcp_grow_window(meta_sk, skb); + + if (incr) { + incr = max_t(int, incr, 2 * skb->len); +- tp->rcv_ssthresh += min(room, incr); ++ meta_tp->rcv_ssthresh += min(room, incr); + inet_csk(sk)->icsk_ack.quick |= 1; } } - if (dev->flags != old_flags) { -+ /* - pr_info("device %s %s promiscuous mode\n", - dev->name, - dev->flags & IFF_PROMISC ? "entered" : "left"); -+ */ - if (audit_enabled) { - current_uid_gid(&uid, &gid); - audit_log(current->audit_context, GFP_ATOMIC, ---- a/drivers/net/usb/r8152.c 2020-08-13 13:11:25.866435255 +0200 -+++ b/drivers/net/usb/r8152.c 2020-08-13 13:11:51.973994306 +0200 -@@ -2353,7 +2353,7 @@ +@@ -612,7 +609,10 @@ void tcp_rcv_space_adjust(struct sock *sk) - if (netdev->flags & IFF_PROMISC) { - /* Unconditionally log net taps. */ -- netif_notice(tp, link, netdev, "Promiscuous mode enabled\n"); -+ //netif_notice(tp, link, netdev, "Promiscuous mode enabled\n"); - ocp_data |= RCR_AM | RCR_AAP; - mc_filter[1] = 0xffffffff; - mc_filter[0] = 0xffffffff; ---- a/drivers/net/usb/pegasus.c 2020-08-13 13:14:15.519570376 +0200 -+++ b/drivers/net/usb/pegasus.c 2020-08-13 13:14:26.795380006 +0200 -@@ -1031,7 +1031,7 @@ + tcp_mstamp_refresh(tp); + time = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcvq_space.time); +- if (time < (tp->rcv_rtt_est.rtt_us >> 3) || tp->rcv_rtt_est.rtt_us == 0) ++ if (mptcp(tp)) { ++ if (mptcp_check_rtt(tp, time)) ++ return; ++ } else if (time < (tp->rcv_rtt_est.rtt_us >> 3) || tp->rcv_rtt_est.rtt_us == 0) + return; - if (net->flags & IFF_PROMISC) { - pegasus->eth_regs[EthCtrl2] |= RX_PROMISCUOUS; -- netif_info(pegasus, link, net, "Promiscuous mode enabled\n"); -+ //netif_info(pegasus, link, net, "Promiscuous mode enabled\n"); - } else if (!netdev_mc_empty(net) || (net->flags & IFF_ALLMULTI)) { - pegasus->eth_regs[EthCtrl0] |= RX_MULTICAST; - pegasus->eth_regs[EthCtrl2] &= ~RX_PROMISCUOUS; ---- a/drivers/net/ethernet/realtek/r8169_main.c 2020-08-13 13:15:44.478068638 +0200 -+++ b/drivers/net/ethernet/realtek/r8169_main.c 2020-08-13 13:15:59.181820450 +0200 -@@ -4313,7 +4313,7 @@ + /* Number of bytes copied to user in last RTT */ +@@ -835,7 +835,7 @@ static void tcp_update_pacing_rate(struct sock *sk) + /* Calculate rto without backoff. This is the second half of Van Jacobson's + * routine referred to above. + */ +-static void tcp_set_rto(struct sock *sk) ++void tcp_set_rto(struct sock *sk) + { + const struct tcp_sock *tp = tcp_sk(sk); + /* Old crap is replaced with new one. 8) +@@ -1407,6 +1407,13 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, + int len; + int in_sack; - if (dev->flags & IFF_PROMISC) { - /* Unconditionally log net taps. */ -- netif_notice(tp, link, dev, "Promiscuous mode enabled\n"); -+ //netif_notice(tp, link, dev, "Promiscuous mode enabled\n"); - rx_mode |= AcceptAllPhys; - } else if (netdev_mc_count(dev) > MC_FILTER_LIMIT || - dev->flags & IFF_ALLMULTI || ++ /* For MPTCP we cannot shift skb-data and remove one skb from the ++ * send-queue, because this will make us loose the DSS-option (which ++ * is stored in TCP_SKB_CB(skb)->dss) of the skb we are removing. 
++ */ ++ if (mptcp(tp)) ++ goto fallback; ++ + /* Normally R but no L won't result in plain S */ + if (!dup_sack && + (TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_RETRANS)) == TCPCB_SACKED_RETRANS) +@@ -2962,7 +2969,7 @@ static bool tcp_ack_update_rtt(struct sock *sk, const int flag, + */ + tcp_update_rtt_min(sk, ca_rtt_us, flag); + tcp_rtt_estimator(sk, seq_rtt_us); +- tcp_set_rto(sk); ++ tp->ops->set_rto(sk); + + /* RFC6298: only reset backoff on valid RTT measurement. */ + inet_csk(sk)->icsk_backoff = 0; +@@ -3030,7 +3037,7 @@ static void tcp_set_xmit_timer(struct sock *sk) + } + + /* If we get here, the whole TSO packet has not been acked. */ +-static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb) ++u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb) + { + struct tcp_sock *tp = tcp_sk(sk); + u32 packets_acked; +@@ -3050,8 +3057,7 @@ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb) + return packets_acked; + } + +-static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, +- u32 prior_snd_una) ++void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, u32 prior_snd_una) + { + const struct skb_shared_info *shinfo; + +@@ -3156,6 +3162,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, + */ + if (likely(!(scb->tcp_flags & TCPHDR_SYN))) { + flag |= FLAG_DATA_ACKED; ++ if (mptcp(tp) && mptcp_is_data_seq(skb)) ++ flag |= MPTCP_FLAG_DATA_ACKED; + } else { + flag |= FLAG_SYN_ACKED; + tp->retrans_stamp = 0; +@@ -3276,7 +3284,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, + return flag; + } + +-static void tcp_ack_probe(struct sock *sk) ++void tcp_ack_probe(struct sock *sk) + { + struct inet_connection_sock *icsk = inet_csk(sk); + struct sk_buff *head = tcp_send_head(sk); +@@ -3350,9 +3358,8 @@ static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked, + /* Check that window update is acceptable. + * The function assumes that snd_una<=ack<=snd_next. + */ +-static inline bool tcp_may_update_window(const struct tcp_sock *tp, +- const u32 ack, const u32 ack_seq, +- const u32 nwin) ++bool tcp_may_update_window(const struct tcp_sock *tp, const u32 ack, ++ const u32 ack_seq, const u32 nwin) + { + return after(ack, tp->snd_una) || + after(ack_seq, tp->snd_wl1) || +@@ -3590,7 +3597,7 @@ static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag) + } + + /* This routine deals with incoming acks, but not outgoing ones. 
*/ +-static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) ++static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) + { + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); +@@ -3713,6 +3720,16 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + + tcp_rack_update_reo_wnd(sk, &rs); + ++ if (mptcp(tp)) { ++ if (mptcp_fallback_infinite(sk, flag)) { ++ pr_debug("%s resetting flow\n", __func__); ++ mptcp_send_reset(sk); ++ return -1; ++ } ++ ++ mptcp_clean_rtx_infinite(skb, sk); ++ } ++ + if (tp->tlp_high_seq) + tcp_process_tlp_ack(sk, ack, flag); + +@@ -3856,8 +3873,10 @@ static u16 tcp_parse_mss_option(const struct tcphdr *th, u16 user_mss) + */ + void tcp_parse_options(const struct net *net, + const struct sk_buff *skb, +- struct tcp_options_received *opt_rx, int estab, +- struct tcp_fastopen_cookie *foc) ++ struct tcp_options_received *opt_rx, ++ struct mptcp_options_received *mopt, ++ int estab, struct tcp_fastopen_cookie *foc, ++ struct tcp_sock *tp) + { + const unsigned char *ptr; + const struct tcphdr *th = tcp_hdr(skb); +@@ -3943,6 +3962,10 @@ void tcp_parse_options(const struct net *net, + */ + break; + #endif ++ case TCPOPT_MPTCP: ++ mptcp_parse_options(ptr - 2, opsize, mopt, skb, tp); ++ break; ++ + case TCPOPT_FASTOPEN: + tcp_parse_fastopen_option( + opsize - TCPOLEN_FASTOPEN_BASE, +@@ -4010,7 +4033,9 @@ static bool tcp_fast_parse_options(const struct net *net, + return true; + } + +- tcp_parse_options(net, skb, &tp->rx_opt, 1, NULL); ++ tcp_parse_options(net, skb, &tp->rx_opt, ++ mptcp(tp) ? &tp->mptcp->rx_opt : NULL, 1, NULL, tp); ++ + if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) + tp->rx_opt.rcv_tsecr -= tp->tsoffset; + +@@ -4120,7 +4145,7 @@ static inline bool tcp_paws_discard(const struct sock *sk, + static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq) + { + return !before(end_seq, tp->rcv_wup) && +- !after(seq, tp->rcv_nxt + tcp_receive_window(tp)); ++ !after(seq, tp->rcv_nxt + tcp_receive_window_no_shrink(tp)); + } + + /* When we get a reset we do this. */ +@@ -4169,6 +4194,11 @@ void tcp_fin(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); + ++ if (is_meta_sk(sk)) { ++ mptcp_fin(sk); ++ return; ++ } ++ + inet_csk_schedule_ack(sk); + + sk->sk_shutdown |= RCV_SHUTDOWN; +@@ -4179,6 +4209,10 @@ void tcp_fin(struct sock *sk) + case TCP_ESTABLISHED: + /* Move to CLOSE_WAIT */ + tcp_set_state(sk, TCP_CLOSE_WAIT); ++ ++ if (mptcp(tp)) ++ mptcp_sub_close_passive(sk); ++ + inet_csk_enter_pingpong_mode(sk); + break; + +@@ -4201,9 +4235,16 @@ void tcp_fin(struct sock *sk) + tcp_set_state(sk, TCP_CLOSING); + break; + case TCP_FIN_WAIT2: ++ if (mptcp(tp)) { ++ /* The socket will get closed by mptcp_data_ready. ++ * We first have to process all data-sequences. ++ */ ++ tp->close_it = 1; ++ break; ++ } + /* Received a FIN -- send ACK and enter TIME_WAIT. */ + tcp_send_ack(sk); +- tcp_time_wait(sk, TCP_TIME_WAIT, 0); ++ tp->ops->time_wait(sk, TCP_TIME_WAIT, 0); + break; + default: + /* Only TCP_LISTEN and TCP_CLOSE are left, in these +@@ -4225,6 +4266,10 @@ void tcp_fin(struct sock *sk) + if (!sock_flag(sk, SOCK_DEAD)) { + sk->sk_state_change(sk); + ++ /* Don't wake up MPTCP-subflows */ ++ if (mptcp(tp)) ++ return; ++ + /* Do not send POLL_HUP for half duplex close. 
*/ + if (sk->sk_shutdown == SHUTDOWN_MASK || + sk->sk_state == TCP_CLOSE) +@@ -4439,6 +4484,9 @@ static bool tcp_try_coalesce(struct sock *sk, + + *fragstolen = false; + ++ if (mptcp(tcp_sk(sk)) && !is_meta_sk(sk)) ++ return false; ++ + /* Its possible this segment overlaps with prior segment in queue */ + if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq) + return false; +@@ -4493,7 +4541,7 @@ static void tcp_drop(struct sock *sk, struct sk_buff *skb) + /* This one checks to see if we can put data from the + * out_of_order queue into the receive_queue. + */ +-static void tcp_ofo_queue(struct sock *sk) ++void tcp_ofo_queue(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); + __u32 dsack_high = tp->rcv_nxt; +@@ -4516,7 +4564,14 @@ static void tcp_ofo_queue(struct sock *sk) + p = rb_next(p); + rb_erase(&skb->rbnode, &tp->out_of_order_queue); + +- if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) { ++ /* In case of MPTCP, the segment may be empty if it's a ++ * non-data DATA_FIN. (see beginning of tcp_data_queue) ++ * ++ * But this only holds true for subflows, not for the ++ * meta-socket. ++ */ ++ if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt) && ++ (is_meta_sk(sk) || !mptcp(tp) || TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq))) { + tcp_drop(sk, skb); + continue; + } +@@ -4546,6 +4601,9 @@ static int tcp_prune_queue(struct sock *sk); + static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb, + unsigned int size) + { ++ if (mptcp(tcp_sk(sk))) ++ sk = mptcp_meta_sk(sk); ++ + if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || + !sk_rmem_schedule(sk, skb, size)) { + +@@ -4560,7 +4618,7 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb, + return 0; + } + +-static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) ++void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) + { + struct tcp_sock *tp = tcp_sk(sk); + struct rb_node **p, *parent; +@@ -4632,7 +4690,8 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) + continue; + } + if (before(seq, TCP_SKB_CB(skb1)->end_seq)) { +- if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) { ++ if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq) && ++ (is_meta_sk(sk) || !mptcp(tp) || end_seq != seq)) { + /* All the bits are present. Drop. 
*/ + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPOFOMERGE); +@@ -4679,6 +4738,11 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) + end_seq); + break; + } ++ /* MPTCP allows non-data data-fin to be in the ofo-queue */ ++ if (mptcp(tp) && !is_meta_sk(sk) && TCP_SKB_CB(skb1)->seq == TCP_SKB_CB(skb1)->end_seq) { ++ skb = skb1; ++ continue; ++ } + rb_erase(&skb1->rbnode, &tp->out_of_order_queue); + tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq, + TCP_SKB_CB(skb1)->end_seq); +@@ -4690,7 +4754,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) + tp->ooo_last_skb = skb; + + add_sack: +- if (tcp_is_sack(tp)) ++ if (tcp_is_sack(tp) && seq != end_seq) + tcp_sack_new_ofo_skb(sk, seq, end_seq); + end: + if (skb) { +@@ -4704,8 +4768,8 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) + } + } + +-static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, +- bool *fragstolen) ++int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, ++ bool *fragstolen) + { + int eaten; + struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue); +@@ -4780,7 +4844,8 @@ void tcp_data_ready(struct sock *sk) + + if (avail < sk->sk_rcvlowat && !tcp_rmem_pressure(sk) && + !sock_flag(sk, SOCK_DONE) && +- tcp_receive_window(tp) > inet_csk(sk)->icsk_ack.rcv_mss) ++ tcp_receive_window_now(tp) > inet_csk(sk)->icsk_ack.rcv_mss && ++ !mptcp(tp)) + return; + + sk->sk_data_ready(sk); +@@ -4792,10 +4857,14 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) + bool fragstolen; + int eaten; + +- if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) { ++ /* If no data is present, but a data_fin is in the options, we still ++ * have to call mptcp_queue_skb later on. */ ++ if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq && ++ !(mptcp(tp) && mptcp_is_data_fin(skb))) { + __kfree_skb(skb); + return; + } ++ + skb_dst_drop(skb); + __skb_pull(skb, tcp_hdr(skb)->doff * 4); + +@@ -4806,7 +4875,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) + * Out of sequence packets to the out_of_order_queue. + */ + if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) { +- if (tcp_receive_window(tp) == 0) { ++ if (tcp_receive_window_no_shrink(tp) == 0) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP); + goto out_of_window; + } +@@ -4822,7 +4891,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) + } + + eaten = tcp_queue_rcv(sk, skb, &fragstolen); +- if (skb->len) ++ if (skb->len || mptcp_is_data_fin(skb)) + tcp_event_data_recv(sk, skb); + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) + tcp_fin(sk); +@@ -4844,7 +4913,11 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) + + if (eaten > 0) + kfree_skb_partial(skb, fragstolen); +- if (!sock_flag(sk, SOCK_DEAD)) ++ if (!sock_flag(sk, SOCK_DEAD) || mptcp(tp)) ++ /* MPTCP: we always have to call data_ready, because ++ * we may be about to receive a data-fin, which still ++ * must get queued. ++ */ + tcp_data_ready(sk); + return; + } +@@ -4864,7 +4937,8 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) + } + + /* Out of window. F.e. zero window probe. */ +- if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp))) ++ if (!before(TCP_SKB_CB(skb)->seq, ++ tp->rcv_nxt + tcp_receive_window_no_shrink(tp))) + goto out_of_window; + + if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { +@@ -4874,7 +4948,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) + /* If window is closed, drop tail of packet. 
But after + * remembering D-SACK for its head made in previous line. + */ +- if (!tcp_receive_window(tp)) { ++ if (!tcp_receive_window_no_shrink(tp)) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP); + goto out_of_window; + } +@@ -5187,7 +5261,7 @@ static int tcp_prune_queue(struct sock *sk) + return -1; + } + +-static bool tcp_should_expand_sndbuf(const struct sock *sk) ++bool tcp_should_expand_sndbuf(const struct sock *sk) + { + const struct tcp_sock *tp = tcp_sk(sk); + +@@ -5222,7 +5296,7 @@ static void tcp_new_space(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); + +- if (tcp_should_expand_sndbuf(sk)) { ++ if (tp->ops->should_expand_sndbuf(sk)) { + tcp_sndbuf_expand(sk); + tp->snd_cwnd_stamp = tcp_jiffies32; + } +@@ -5236,10 +5310,11 @@ static void tcp_check_space(struct sock *sk) + sock_reset_flag(sk, SOCK_QUEUE_SHRUNK); + /* pairs with tcp_poll() */ + smp_mb(); +- if (sk->sk_socket && +- test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { ++ if (mptcp(tcp_sk(sk)) || ++ (sk->sk_socket && ++ test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))) { + tcp_new_space(sk); +- if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) ++ if (sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) + tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED); + } + } +@@ -5258,6 +5333,8 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) + { + struct tcp_sock *tp = tcp_sk(sk); + unsigned long rtt, delay; ++ struct sock *meta_sk = mptcp(tp) ? mptcp_meta_sk(sk) : sk; ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk); + + /* More than one full frame received... */ + if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss && +@@ -5266,8 +5343,8 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) + * If application uses SO_RCVLOWAT, we want send ack now if + * we have not received enough bytes to satisfy the condition. + */ +- (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || +- __tcp_select_window(sk) >= tp->rcv_wnd)) || ++ (meta_tp->rcv_nxt - meta_tp->copied_seq < meta_sk->sk_rcvlowat || ++ tp->ops->__select_window(sk) >= tp->rcv_wnd)) || + /* We ACK each frame or... */ + tcp_in_quickack_mode(sk) || + /* Protocol state mandates a one-time immediate ACK */ +@@ -5402,6 +5479,10 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *t + { + struct tcp_sock *tp = tcp_sk(sk); + ++ /* MPTCP urgent data is not yet supported */ ++ if (mptcp(tp)) ++ return; ++ + /* Check if we get a new urgent pointer - normally not. */ + if (th->urg) + tcp_check_urg(sk, th); +@@ -5544,9 +5625,15 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, + goto discard; + } + ++ /* If valid: post process the received MPTCP options. */ ++ if (mptcp(tp) && mptcp_handle_options(sk, th, skb)) ++ goto discard; ++ + return true; + + discard: ++ if (mptcp(tp)) ++ mptcp_reset_mopt(tp); + tcp_drop(sk, skb); + return false; + } +@@ -5603,6 +5690,10 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) + + tp->rx_opt.saw_tstamp = 0; + ++ /* MPTCP: force slowpath. 
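/*
 * tcp_receive_window_now() and tcp_receive_window_no_shrink(), used in the
 * receive-path checks above, split the old tcp_receive_window() into "what
 * is advertised right now" and "the right-most edge ever announced". The
 * latter matters because with MPTCP the meta-level window can effectively
 * shrink from a subflow's point of view, yet data up to the edge that was
 * already promised to the peer must still be accepted. A sketch of the
 * intended semantics, inferred from the rcv_right_edge assignments in this
 * patch (the real helpers are defined in headers outside these hunks):
 */
static inline u32 tcp_receive_window_now(const struct tcp_sock *tp)
{
	s32 win = tp->rcv_wup + tp->rcv_wnd - tp->rcv_nxt;

	return win < 0 ? 0 : (u32)win;
}

static inline void tcp_update_rcv_right_edge(struct tcp_sock *tp)
{
	/* The remembered edge only ever moves forward. */
	if (after(tp->rcv_wup + tp->rcv_wnd, tp->rcv_right_edge))
		tp->rcv_right_edge = tp->rcv_wup + tp->rcv_wnd;
}

static inline u32 tcp_receive_window_no_shrink(const struct tcp_sock *tp)
{
	/* Measure against the furthest edge already announced to the peer. */
	s32 win = tp->rcv_right_edge - tp->rcv_nxt;

	return win < 0 ? 0 : (u32)win;
}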
*/ ++ if (mptcp(tp)) ++ goto slow_path; ++ + /* pred_flags is 0xS?10 << 16 + snd_wnd + * if header_prediction is to be made + * 'S' will always be tp->tcp_header_len >> 2 +@@ -5777,7 +5868,7 @@ void tcp_init_transfer(struct sock *sk, int bpf_op) + + tcp_call_bpf(sk, bpf_op, 0, NULL); + tcp_init_congestion_control(sk); +- tcp_init_buffer_space(sk); ++ tcp_sk(sk)->ops->init_buffer_space(sk); + } + + void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) +@@ -5814,17 +5905,24 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, + struct tcp_fastopen_cookie *cookie) + { + struct tcp_sock *tp = tcp_sk(sk); +- struct sk_buff *data = tp->syn_data ? tcp_rtx_queue_head(sk) : NULL; ++ struct sk_buff *data = NULL; + u16 mss = tp->rx_opt.mss_clamp, try_exp = 0; + bool syn_drop = false; + ++ if (tp->syn_data) { ++ if (mptcp(tp)) ++ data = tcp_write_queue_head(mptcp_meta_sk(sk)); ++ else ++ data = tcp_rtx_queue_head(sk); ++ } ++ + if (mss == tp->rx_opt.user_mss) { + struct tcp_options_received opt; + + /* Get original SYNACK MSS value if user MSS sets mss_clamp */ + tcp_clear_options(&opt); + opt.user_mss = opt.mss_clamp = 0; +- tcp_parse_options(sock_net(sk), synack, &opt, 0, NULL); ++ tcp_parse_options(sock_net(sk), synack, &opt, NULL, 0, NULL, NULL); + mss = opt.mss_clamp; + } + +@@ -5848,7 +5946,11 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, + + tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp); + +- if (data) { /* Retransmit unacked data in SYN */ ++ /* In mptcp case, we do not rely on "retransmit", but instead on ++ * "transmit", because if fastopen data is not acked, the retransmission ++ * becomes the first MPTCP data (see mptcp_rcv_synsent_fastopen). ++ */ ++ if (data && !mptcp(tp)) { /* Retransmit unacked data in SYN */ + skb_rbtree_walk_from(data) { + if (__tcp_retransmit_skb(sk, data, 1)) + break; +@@ -5903,9 +6005,13 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, + struct tcp_sock *tp = tcp_sk(sk); + struct tcp_fastopen_cookie foc = { .len = -1 }; + int saved_clamp = tp->rx_opt.mss_clamp; ++ struct mptcp_options_received mopt; + bool fastopen_fail; + +- tcp_parse_options(sock_net(sk), skb, &tp->rx_opt, 0, &foc); ++ mptcp_init_mp_opt(&mopt); ++ ++ tcp_parse_options(sock_net(sk), skb, &tp->rx_opt, ++ mptcp(tp) ? &tp->mptcp->rx_opt : &mopt, 0, &foc, tp); + if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) + tp->rx_opt.rcv_tsecr -= tp->tsoffset; + +@@ -5966,11 +6072,41 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, + tcp_try_undo_spurious_syn(sk); + tcp_ack(sk, skb, FLAG_SLOWPATH); + ++ if (tp->request_mptcp || mptcp(tp)) { ++ int ret; ++ ++ rcu_read_lock(); ++ local_bh_disable(); ++ ret = mptcp_rcv_synsent_state_process(sk, &sk, ++ skb, &mopt); ++ local_bh_enable(); ++ rcu_read_unlock(); ++ ++ /* May have changed if we support MPTCP */ ++ tp = tcp_sk(sk); ++ icsk = inet_csk(sk); ++ ++ if (ret == 1) ++ goto reset_and_undo; ++ if (ret == 2) ++ goto discard; ++ } ++ ++ if (mptcp(tp) && !is_master_tp(tp)) { ++ /* Timer for repeating the ACK until an answer ++ * arrives. Used only when establishing an additional ++ * subflow inside of an MPTCP connection. ++ */ ++ sk_reset_timer(sk, &tp->mptcp->mptcp_ack_timer, ++ jiffies + icsk->icsk_rto); ++ } ++ + /* Ok.. it's good. Set up sequence numbers and + * move to established. 
+ */ + WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + 1); + tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1; ++ tcp_update_rcv_right_edge(tp); + + /* RFC1323: The window in SYN & SYN/ACK segments is + * never scaled. +@@ -5992,6 +6128,11 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, + tp->tcp_header_len = sizeof(struct tcphdr); + } + ++ if (mptcp(tp)) { ++ tp->tcp_header_len += MPTCP_SUB_LEN_DSM_ALIGN; ++ tp->advmss -= MPTCP_SUB_LEN_DSM_ALIGN; ++ } ++ + tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); + tcp_initialize_rcv_mss(sk); + +@@ -6015,9 +6156,12 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, + } + if (fastopen_fail) + return -1; +- if (sk->sk_write_pending || ++ /* With MPTCP we cannot send data on the third ack due to the ++ * lack of option-space to combine with an MP_CAPABLE. ++ */ ++ if (!mptcp(tp) && (sk->sk_write_pending || + icsk->icsk_accept_queue.rskq_defer_accept || +- inet_csk_in_pingpong_mode(sk)) { ++ inet_csk_in_pingpong_mode(sk))) { + /* Save one ACK. Data will be ready after + * several ticks, if write_pending is set. + * +@@ -6056,6 +6200,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, + tcp_paws_reject(&tp->rx_opt, 0)) + goto discard_and_undo; + ++ /* TODO - check this here for MPTCP */ + if (th->syn) { + /* We see SYN without ACK. It is attempt of + * simultaneous connect with crossed SYNs. +@@ -6072,9 +6217,15 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, + tp->tcp_header_len = sizeof(struct tcphdr); + } + ++ if (mptcp(tp)) { ++ tp->tcp_header_len += MPTCP_SUB_LEN_DSM_ALIGN; ++ tp->advmss -= MPTCP_SUB_LEN_DSM_ALIGN; ++ } ++ + WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + 1); + WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); + tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1; ++ tcp_update_rcv_right_edge(tp); + + /* RFC1323: The window in SYN & SYN/ACK segments is + * never scaled. +@@ -6162,6 +6313,7 @@ static void tcp_rcv_synrecv_state_fastopen(struct sock *sk) + */ + + int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) ++ __releases(&sk->sk_lock.slock) + { + struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); +@@ -6204,6 +6356,16 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) + tp->rx_opt.saw_tstamp = 0; + tcp_mstamp_refresh(tp); + queued = tcp_rcv_synsent_state_process(sk, skb, th); ++ if (is_meta_sk(sk)) { ++ sk = tcp_sk(sk)->mpcb->master_sk; ++ tp = tcp_sk(sk); ++ ++ /* Need to call it here, because it will announce new ++ * addresses, which can only be done after the third ack ++ * of the 3-way handshake. ++ */ ++ mptcp_update_metasocket(tp->meta_sk); ++ } + if (queued >= 0) + return queued; + +@@ -6276,6 +6438,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) + + if (tp->rx_opt.tstamp_ok) + tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; ++ if (mptcp(tp)) ++ tp->advmss -= MPTCP_SUB_LEN_DSM_ALIGN; + + if (!inet_csk(sk)->icsk_ca_ops->cong_control) + tcp_update_pacing_rate(sk); +@@ -6285,6 +6449,30 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) + + tcp_initialize_rcv_mss(sk); + tcp_fast_path_on(tp); ++ ++ /* Send an ACK when establishing a new MPTCP subflow, i.e. ++ * using an MP_JOIN subtype. 
++ */ ++ if (mptcp(tp)) { ++ if (is_master_tp(tp)) { ++ mptcp_update_metasocket(mptcp_meta_sk(sk)); ++ } else { ++ struct sock *meta_sk = mptcp_meta_sk(sk); ++ ++ tcp_send_ack(sk); ++ ++ /* Update RTO as it might be worse/better */ ++ mptcp_set_rto(sk); ++ ++ /* If the new RTO would fire earlier, pull it in! */ ++ if (tcp_sk(meta_sk)->packets_out && ++ icsk->icsk_timeout > inet_csk(meta_sk)->icsk_rto + jiffies) { ++ tcp_rearm_rto(meta_sk); ++ } ++ ++ mptcp_push_pending_frames(mptcp_meta_sk(sk)); ++ } ++ } + break; + + case TCP_FIN_WAIT1: { +@@ -6325,7 +6513,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) + tmo = tcp_fin_time(sk); + if (tmo > TCP_TIMEWAIT_LEN) { + inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN); +- } else if (th->fin || sock_owned_by_user(sk)) { ++ } else if (th->fin || mptcp_is_data_fin(skb) || ++ sock_owned_by_user(sk)) { + /* Bad case. We could lose such FIN otherwise. + * It is not a big problem, but it looks confusing + * and not so rare event. We still can lose it now, +@@ -6334,7 +6523,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) + */ + inet_csk_reset_keepalive_timer(sk, tmo); + } else { +- tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); ++ tp->ops->time_wait(sk, TCP_FIN_WAIT2, tmo); + goto discard; + } + break; +@@ -6342,7 +6531,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) + + case TCP_CLOSING: + if (tp->snd_una == tp->write_seq) { +- tcp_time_wait(sk, TCP_TIME_WAIT, 0); ++ tp->ops->time_wait(sk, TCP_TIME_WAIT, 0); + goto discard; + } + break; +@@ -6354,6 +6543,9 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) + goto discard; + } + break; ++ case TCP_CLOSE: ++ if (tp->mp_killed) ++ goto discard; + } + + /* step 6: check the URG bit */ +@@ -6375,7 +6567,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) + */ + if (sk->sk_shutdown & RCV_SHUTDOWN) { + if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && +- after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) { ++ after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt) && ++ !mptcp(tp)) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA); + tcp_reset(sk); + return 1; +@@ -6477,6 +6670,8 @@ static void tcp_openreq_init(struct request_sock *req, + ireq->wscale_ok = rx_opt->wscale_ok; + ireq->acked = 0; + ireq->ecn_ok = 0; ++ ireq->mptcp_rqsk = 0; ++ ireq->saw_mpc = 0; + ireq->ir_rmt_port = tcp_hdr(skb)->source; + ireq->ir_num = ntohs(tcp_hdr(skb)->dest); + ireq->ir_mark = inet_request_mark(sk, skb); +@@ -6602,12 +6797,17 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, + /* TW buckets are converted to open requests without + * limitations, they conserve resources and peer is + * evidently real one. ++ * ++ * MPTCP: new subflows cannot be established in a stateless manner. + */ +- if ((net->ipv4.sysctl_tcp_syncookies == 2 || ++ if (((!is_meta_sk(sk) && net->ipv4.sysctl_tcp_syncookies == 2) || + inet_csk_reqsk_queue_is_full(sk)) && !isn) { + want_cookie = tcp_syn_flood_action(sk, rsk_ops->slab_name); + if (!want_cookie) + goto drop; ++ ++ if (is_meta_sk(sk)) ++ goto drop; + } + + if (sk_acceptq_is_full(sk)) { +@@ -6625,8 +6825,8 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, + tcp_clear_options(&tmp_opt); + tmp_opt.mss_clamp = af_ops->mss_clamp; + tmp_opt.user_mss = tp->rx_opt.user_mss; +- tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0, +- want_cookie ? NULL : &foc); ++ tcp_parse_options(sock_net(sk), skb, &tmp_opt, NULL, 0, ++ want_cookie ? 
NULL : &foc, NULL); + + if (want_cookie && !tmp_opt.saw_tstamp) + tcp_clear_options(&tmp_opt); +@@ -6641,7 +6841,8 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, + /* Note: tcp_v6_init_req() might override ir_iif for link locals */ + inet_rsk(req)->ir_iif = inet_request_bound_dev_if(sk, skb); + +- af_ops->init_req(req, sk, skb); ++ if (af_ops->init_req(req, sk, skb, want_cookie)) ++ goto drop_and_free; + + if (security_inet_conn_request(sk, skb, req)) + goto drop_and_free; +@@ -6677,7 +6878,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, + tcp_ecn_create_request(req, skb, sk, dst); + + if (want_cookie) { +- isn = cookie_init_sequence(af_ops, sk, skb, &req->mss); ++ isn = cookie_init_sequence(af_ops, req, sk, skb, &req->mss); + req->cookie_ts = tmp_opt.tstamp_ok; + if (!tmp_opt.tstamp_ok) + inet_rsk(req)->ecn_ok = 0; +@@ -6692,17 +6893,25 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, + fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst); + } + if (fastopen_sk) { ++ struct sock *meta_sk = fastopen_sk; ++ ++ if (mptcp(tcp_sk(fastopen_sk))) ++ meta_sk = mptcp_meta_sk(fastopen_sk); + af_ops->send_synack(fastopen_sk, dst, &fl, req, + &foc, TCP_SYNACK_FASTOPEN); + /* Add the child socket directly into the accept queue */ +- if (!inet_csk_reqsk_queue_add(sk, req, fastopen_sk)) { ++ if (!inet_csk_reqsk_queue_add(sk, req, meta_sk)) { + reqsk_fastopen_remove(fastopen_sk, req, false); + bh_unlock_sock(fastopen_sk); ++ if (meta_sk != fastopen_sk) ++ bh_unlock_sock(meta_sk); + sock_put(fastopen_sk); + goto drop_and_free; + } + sk->sk_data_ready(sk); + bh_unlock_sock(fastopen_sk); ++ if (meta_sk != fastopen_sk) ++ bh_unlock_sock(meta_sk); + sock_put(fastopen_sk); + } else { + tcp_rsk(req)->tfo_listener = false; +diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c +index 2ce85e52aea7..2e76c006ad16 100644 +--- a/net/ipv4/tcp_ipv4.c ++++ b/net/ipv4/tcp_ipv4.c +@@ -62,6 +62,8 @@ + #include + #include + #include ++#include ++#include + #include + #include + #include +@@ -209,6 +211,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) + struct ip_options_rcu *inet_opt; + struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row; + ++ mptcp_init_connect(sk); ++ + if (addr_len < sizeof(struct sockaddr_in)) + return -EINVAL; + +@@ -430,7 +434,7 @@ int tcp_v4_err(struct sk_buff *icmp_skb, u32 info) + struct inet_sock *inet; + const int type = icmp_hdr(icmp_skb)->type; + const int code = icmp_hdr(icmp_skb)->code; +- struct sock *sk; ++ struct sock *sk, *meta_sk; + struct sk_buff *skb; + struct request_sock *fastopen; + u32 seq, snd_una; +@@ -460,13 +464,19 @@ int tcp_v4_err(struct sk_buff *icmp_skb, u32 info) + return 0; + } + +- bh_lock_sock(sk); ++ tp = tcp_sk(sk); ++ if (mptcp(tp)) ++ meta_sk = mptcp_meta_sk(sk); ++ else ++ meta_sk = sk; ++ ++ bh_lock_sock(meta_sk); + /* If too many ICMPs get dropped on busy + * servers this needs to be solved differently. + * We do take care of PMTU discovery (RFC1191) special case : + * we can receive locally generated ICMP messages while socket is held. 
+ */ +- if (sock_owned_by_user(sk)) { ++ if (sock_owned_by_user(meta_sk)) { + if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)) + __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS); + } +@@ -479,7 +489,6 @@ int tcp_v4_err(struct sk_buff *icmp_skb, u32 info) + } + + icsk = inet_csk(sk); +- tp = tcp_sk(sk); + /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */ + fastopen = rcu_dereference(tp->fastopen_rsk); + snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una; +@@ -513,11 +522,13 @@ int tcp_v4_err(struct sk_buff *icmp_skb, u32 info) + goto out; + + WRITE_ONCE(tp->mtu_info, info); +- if (!sock_owned_by_user(sk)) { ++ if (!sock_owned_by_user(meta_sk)) { + tcp_v4_mtu_reduced(sk); + } else { + if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags)) + sock_hold(sk); ++ if (mptcp(tp)) ++ mptcp_tsq_flags(sk); + } + goto out; + } +@@ -531,7 +542,7 @@ int tcp_v4_err(struct sk_buff *icmp_skb, u32 info) + !icsk->icsk_backoff || fastopen) + break; + +- if (sock_owned_by_user(sk)) ++ if (sock_owned_by_user(meta_sk)) + break; + + skb = tcp_rtx_queue_head(sk); +@@ -555,7 +566,7 @@ int tcp_v4_err(struct sk_buff *icmp_skb, u32 info) + } else { + /* RTO revert clocked out retransmission. + * Will retransmit now */ +- tcp_retransmit_timer(sk); ++ tcp_sk(sk)->ops->retransmit_timer(sk); + } + + break; +@@ -575,7 +586,7 @@ int tcp_v4_err(struct sk_buff *icmp_skb, u32 info) + if (fastopen && !fastopen->sk) + break; + +- if (!sock_owned_by_user(sk)) { ++ if (!sock_owned_by_user(meta_sk)) { + sk->sk_err = err; + + sk->sk_error_report(sk); +@@ -604,7 +615,7 @@ int tcp_v4_err(struct sk_buff *icmp_skb, u32 info) + */ + + inet = inet_sk(sk); +- if (!sock_owned_by_user(sk) && inet->recverr) { ++ if (!sock_owned_by_user(meta_sk) && inet->recverr) { + sk->sk_err = err; + sk->sk_error_report(sk); + } else { /* Only an error on timeout */ +@@ -612,7 +623,7 @@ int tcp_v4_err(struct sk_buff *icmp_skb, u32 info) + } + + out: +- bh_unlock_sock(sk); ++ bh_unlock_sock(meta_sk); + sock_put(sk); + return 0; + } +@@ -648,7 +659,7 @@ EXPORT_SYMBOL(tcp_v4_send_check); + * Exception: precedence violation. We do not implement it in any case. + */ + +-static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) ++void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) + { + const struct tcphdr *th = tcp_hdr(skb); + struct { +@@ -800,10 +811,10 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) + */ + + static void tcp_v4_send_ack(const struct sock *sk, +- struct sk_buff *skb, u32 seq, u32 ack, ++ struct sk_buff *skb, u32 seq, u32 ack, u32 data_ack, + u32 win, u32 tsval, u32 tsecr, int oif, + struct tcp_md5sig_key *key, +- int reply_flags, u8 tos) ++ int reply_flags, u8 tos, int mptcp) + { + const struct tcphdr *th = tcp_hdr(skb); + struct { +@@ -811,6 +822,10 @@ static void tcp_v4_send_ack(const struct sock *sk, + __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2) + #ifdef CONFIG_TCP_MD5SIG + + (TCPOLEN_MD5SIG_ALIGNED >> 2) ++#endif ++#ifdef CONFIG_MPTCP ++ + ((MPTCP_SUB_LEN_DSS >> 2) + ++ (MPTCP_SUB_LEN_ACK >> 2)) + #endif + ]; + } rep; +@@ -858,6 +873,21 @@ static void tcp_v4_send_ack(const struct sock *sk, + ip_hdr(skb)->daddr, &rep.th); + } + #endif ++#ifdef CONFIG_MPTCP ++ if (mptcp) { ++ int offset = (tsecr) ? 
3 : 0; ++ /* Construction of 32-bit data_ack */ ++ rep.opt[offset++] = htonl((TCPOPT_MPTCP << 24) | ++ ((MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK) << 16) | ++ (0x20 << 8) | ++ (0x01)); ++ rep.opt[offset] = htonl(data_ack); ++ ++ arg.iov[0].iov_len += MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK; ++ rep.th.doff = arg.iov[0].iov_len / 4; ++ } ++#endif /* CONFIG_MPTCP */ ++ + arg.flags = reply_flags; + arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, + ip_hdr(skb)->saddr, /* XXX */ +@@ -889,28 +919,36 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) + { + struct inet_timewait_sock *tw = inet_twsk(sk); + struct tcp_timewait_sock *tcptw = tcp_twsk(sk); ++ u32 data_ack = 0; ++ int mptcp = 0; ++ ++ if (tcptw->mptcp_tw) { ++ data_ack = (u32)tcptw->mptcp_tw->rcv_nxt; ++ mptcp = 1; ++ } + + tcp_v4_send_ack(sk, skb, +- tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, ++ tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, data_ack, + tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, + tcp_time_stamp_raw() + tcptw->tw_ts_offset, + tcptw->tw_ts_recent, + tw->tw_bound_dev_if, + tcp_twsk_md5_key(tcptw), + tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0, +- tw->tw_tos ++ tw->tw_tos, mptcp + ); + + inet_twsk_put(tw); + } + +-static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, +- struct request_sock *req) ++void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, ++ struct request_sock *req) + { + /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV + * sk->sk_state == TCP_SYN_RECV -> for Fast Open. + */ +- u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 : ++ u32 seq = (sk->sk_state == TCP_LISTEN || is_meta_sk(sk)) ? ++ tcp_rsk(req)->snt_isn + 1 : + tcp_sk(sk)->snd_nxt; + + /* RFC 7323 2.3 +@@ -919,7 +957,7 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, + * Rcv.Wind.Shift bits: + */ + tcp_v4_send_ack(sk, skb, seq, +- tcp_rsk(req)->rcv_nxt, ++ tcp_rsk(req)->rcv_nxt, 0, + req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale, + tcp_time_stamp_raw() + tcp_rsk(req)->ts_off, + req->ts_recent, +@@ -927,7 +965,7 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, + tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr, + AF_INET), + inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0, +- ip_hdr(skb)->tos); ++ ip_hdr(skb)->tos, 0); + } + + /* +@@ -935,11 +973,11 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, + * This still operates on a request_sock only, not on a big + * socket. + */ +-static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, +- struct flowi *fl, +- struct request_sock *req, +- struct tcp_fastopen_cookie *foc, +- enum tcp_synack_type synack_type) ++int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, ++ struct flowi *fl, ++ struct request_sock *req, ++ struct tcp_fastopen_cookie *foc, ++ enum tcp_synack_type synack_type) + { + const struct inet_request_sock *ireq = inet_rsk(req); + struct flowi4 fl4; +@@ -969,7 +1007,7 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, + /* + * IPv4 request_sock destructor. 
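/*
 * The two 32-bit words built in tcp_v4_send_ack() above form a minimal
 * MPTCP DSS option carrying only a 32-bit DATA_ACK; in these hunks it is
 * emitted when ACKing on behalf of a TIME_WAIT socket that belonged to an
 * MPTCP connection. Layout of the first word, following RFC 6824 and the
 * constants used here:
 *
 *   byte 0  TCPOPT_MPTCP           - TCP option kind 30 (MPTCP)
 *   byte 1  MPTCP_SUB_LEN_DSS +
 *           MPTCP_SUB_LEN_ACK      - option length: 4-byte DSS base plus a
 *                                    4-byte data-level ACK
 *   byte 2  0x20                   - DSS subtype (0x2) in the high nibble,
 *                                    reserved bits zero
 *   byte 3  0x01                   - flags: "DATA_ACK present", 32-bit form
 *
 * The second word is simply htonl(data_ack), the data-level cumulative ACK
 * (taken from mptcp_tw->rcv_nxt in the TIME_WAIT path above).
 */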
+ */ +-static void tcp_v4_reqsk_destructor(struct request_sock *req) ++void tcp_v4_reqsk_destructor(struct request_sock *req) + { + kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1)); + } +@@ -1354,9 +1392,10 @@ static bool tcp_v4_inbound_md5_hash(const struct sock *sk, + return false; + } + +-static void tcp_v4_init_req(struct request_sock *req, +- const struct sock *sk_listener, +- struct sk_buff *skb) ++static int tcp_v4_init_req(struct request_sock *req, ++ const struct sock *sk_listener, ++ struct sk_buff *skb, ++ bool want_cookie) + { + struct inet_request_sock *ireq = inet_rsk(req); + struct net *net = sock_net(sk_listener); +@@ -1364,6 +1403,8 @@ static void tcp_v4_init_req(struct request_sock *req, + sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); + sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); + RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb)); ++ ++ return 0; + } + + static struct dst_entry *tcp_v4_route_req(const struct sock *sk, +@@ -1383,7 +1424,7 @@ struct request_sock_ops tcp_request_sock_ops __read_mostly = { + .syn_ack_timeout = tcp_syn_ack_timeout, + }; + +-static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { ++const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { + .mss_clamp = TCP_MSS_DEFAULT, + #ifdef CONFIG_TCP_MD5SIG + .req_md5_lookup = tcp_v4_md5_lookup, +@@ -1520,7 +1561,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, + } + EXPORT_SYMBOL(tcp_v4_syn_recv_sock); + +-static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb) ++struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb) + { + #ifdef CONFIG_SYN_COOKIES + const struct tcphdr *th = tcp_hdr(skb); +@@ -1558,6 +1599,9 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) + { + struct sock *rsk; + ++ if (is_meta_sk(sk)) ++ return mptcp_v4_do_rcv(sk, skb); ++ + if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ + struct dst_entry *dst = sk->sk_rx_dst; + +@@ -1803,6 +1847,10 @@ static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph, + TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + + skb->len - th->doff * 4); + TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); ++#ifdef CONFIG_MPTCP ++ TCP_SKB_CB(skb)->mptcp_flags = 0; ++ TCP_SKB_CB(skb)->dss_off = 0; ++#endif + TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); + TCP_SKB_CB(skb)->tcp_tw_isn = 0; + TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); +@@ -1822,8 +1870,8 @@ int tcp_v4_rcv(struct sk_buff *skb) + int sdif = inet_sdif(skb); + const struct iphdr *iph; + const struct tcphdr *th; ++ struct sock *sk, *meta_sk = NULL; + bool refcounted; +- struct sock *sk; + int ret; + + if (skb->pkt_type != PACKET_HOST) +@@ -1877,7 +1925,11 @@ int tcp_v4_rcv(struct sk_buff *skb) + reqsk_put(req); + goto csum_error; + } +- if (unlikely(sk->sk_state != TCP_LISTEN)) { ++ if (unlikely(sk->sk_state != TCP_LISTEN && !is_meta_sk(sk))) { ++ inet_csk_reqsk_queue_drop_and_put(sk, req); ++ goto lookup; ++ } ++ if (unlikely(is_meta_sk(sk) && !mptcp_can_new_subflow(sk))) { + inet_csk_reqsk_queue_drop_and_put(sk, req); + goto lookup; + } +@@ -1886,6 +1938,7 @@ int tcp_v4_rcv(struct sk_buff *skb) + */ + sock_hold(sk); + refcounted = true; ++ + nsk = NULL; + if (!tcp_filter(sk, skb)) { + th = (const struct tcphdr *)skb->data; +@@ -1946,19 +1999,28 @@ int tcp_v4_rcv(struct sk_buff *skb) + + sk_incoming_cpu_update(sk); + +- bh_lock_sock_nested(sk); ++ if (mptcp(tcp_sk(sk))) { ++ meta_sk = mptcp_meta_sk(sk); ++ ++ 
bh_lock_sock_nested(meta_sk); ++ if (sock_owned_by_user(meta_sk)) ++ mptcp_prepare_for_backlog(sk, skb); ++ } else { ++ meta_sk = sk; ++ bh_lock_sock_nested(sk); ++ } + tcp_segs_in(tcp_sk(sk), skb); + ret = 0; +- if (!sock_owned_by_user(sk)) { ++ if (!sock_owned_by_user(meta_sk)) { + skb_to_free = sk->sk_rx_skb_cache; + sk->sk_rx_skb_cache = NULL; + ret = tcp_v4_do_rcv(sk, skb); + } else { +- if (tcp_add_backlog(sk, skb)) ++ if (tcp_add_backlog(meta_sk, skb)) + goto discard_and_relse; + skb_to_free = NULL; + } +- bh_unlock_sock(sk); ++ bh_unlock_sock(meta_sk); + if (skb_to_free) + __kfree_skb(skb_to_free); + +@@ -1974,6 +2036,19 @@ int tcp_v4_rcv(struct sk_buff *skb) + + tcp_v4_fill_cb(skb, iph, th); + ++#ifdef CONFIG_MPTCP ++ if (!sk && th->syn && !th->ack) { ++ int ret = mptcp_lookup_join(skb, NULL); ++ ++ if (ret < 0) { ++ tcp_v4_send_reset(NULL, skb); ++ goto discard_it; ++ } else if (ret > 0) { ++ return 0; ++ } ++ } ++#endif ++ + if (tcp_checksum_complete(skb)) { + csum_error: + __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); +@@ -2022,6 +2097,18 @@ int tcp_v4_rcv(struct sk_buff *skb) + refcounted = false; + goto process; + } ++#ifdef CONFIG_MPTCP ++ if (th->syn && !th->ack) { ++ int ret = mptcp_lookup_join(skb, inet_twsk(sk)); ++ ++ if (ret < 0) { ++ tcp_v4_send_reset(NULL, skb); ++ goto discard_it; ++ } else if (ret > 0) { ++ return 0; ++ } ++ } ++#endif + } + /* to ACK */ + /* fall through */ +@@ -2091,7 +2178,12 @@ static int tcp_v4_init_sock(struct sock *sk) + + tcp_init_sock(sk); + +- icsk->icsk_af_ops = &ipv4_specific; ++#ifdef CONFIG_MPTCP ++ if (sock_flag(sk, SOCK_MPTCP)) ++ icsk->icsk_af_ops = &mptcp_v4_specific; ++ else ++#endif ++ icsk->icsk_af_ops = &ipv4_specific; + + #ifdef CONFIG_TCP_MD5SIG + tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific; +@@ -2110,6 +2202,11 @@ void tcp_v4_destroy_sock(struct sock *sk) + + tcp_cleanup_congestion_control(sk); + ++ if (mptcp(tp)) ++ mptcp_destroy_sock(sk); ++ if (tp->inside_tk_table) ++ mptcp_hash_remove_bh(tp); ++ + tcp_cleanup_ulp(sk); + + /* Cleanup up the write buffer. 
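/*
 * Note on the locking changes above: the receive path now takes the MPTCP
 * meta socket's lock instead of the subflow's. Subflow segments ultimately
 * modify meta-level state (data-sequence reassembly, the shared receive
 * window), so serialisation has to happen on the meta socket; when the meta
 * socket is owned by a user context, the segment is queued on the meta
 * backlog instead, after mptcp_prepare_for_backlog() has recorded which
 * subflow it belongs to. The mptcp_lookup_join() calls added above cover
 * SYNs carrying an MP_JOIN for an already established connection, which
 * would otherwise not match any listening or TIME_WAIT socket.
 */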
*/ +@@ -2615,6 +2712,11 @@ struct proto tcp_prot = { + .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), + .max_header = MAX_TCP_HEADER, + .obj_size = sizeof(struct tcp_sock), ++#ifdef CONFIG_MPTCP ++ .useroffset = offsetof(struct tcp_sock, mptcp_sched_name), ++ .usersize = sizeof_field(struct tcp_sock, mptcp_sched_name) + ++ sizeof_field(struct tcp_sock, mptcp_pm_name), ++#endif + .slab_flags = SLAB_TYPESAFE_BY_RCU, + .twsk_prot = &tcp_timewait_sock_ops, + .rsk_prot = &tcp_request_sock_ops, +@@ -2625,6 +2727,9 @@ struct proto tcp_prot = { + .compat_getsockopt = compat_tcp_getsockopt, + #endif + .diag_destroy = tcp_abort, ++#ifdef CONFIG_MPTCP ++ .clear_sk = mptcp_clear_sk, ++#endif + }; + EXPORT_SYMBOL(tcp_prot); + +diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c +index 194743bd3fc1..b35942faf7df 100644 +--- a/net/ipv4/tcp_minisocks.c ++++ b/net/ipv4/tcp_minisocks.c +@@ -19,11 +19,13 @@ + * Jorge Cwik, + */ + ++#include + #include + #include + #include + #include + #include ++#include + #include + #include + #include +@@ -95,10 +97,14 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, + struct tcp_options_received tmp_opt; + struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); + bool paws_reject = false; ++ struct mptcp_options_received mopt; + + tmp_opt.saw_tstamp = 0; +- if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) { +- tcp_parse_options(twsk_net(tw), skb, &tmp_opt, 0, NULL); ++ if (th->doff > (sizeof(*th) >> 2) && ++ (tcptw->tw_ts_recent_stamp || tcptw->mptcp_tw)) { ++ mptcp_init_mp_opt(&mopt); ++ ++ tcp_parse_options(twsk_net(tw), skb, &tmp_opt, &mopt, 0, NULL, NULL); + + if (tmp_opt.saw_tstamp) { + if (tmp_opt.rcv_tsecr) +@@ -107,6 +113,11 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, + tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; + paws_reject = tcp_paws_reject(&tmp_opt, th->rst); + } ++ ++ if (unlikely(mopt.mp_fclose) && tcptw->mptcp_tw) { ++ if (mopt.mptcp_sender_key == tcptw->mptcp_tw->loc_key) ++ return TCP_TW_RST; ++ } + } + + if (tw->tw_substate == TCP_FIN_WAIT2) { +@@ -130,6 +141,16 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, + if (!th->ack || + !after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) || + TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) { ++ /* If mptcp_is_data_fin() returns true, we are sure that ++ * mopt has been initialized - otherwise it would not ++ * be a DATA_FIN. ++ */ ++ if (tcptw->mptcp_tw && tcptw->mptcp_tw->meta_tw && ++ mptcp_is_data_fin(skb) && ++ TCP_SKB_CB(skb)->seq == tcptw->tw_rcv_nxt && ++ mopt.data_seq + 1 == (u32)tcptw->mptcp_tw->rcv_nxt) ++ return TCP_TW_ACK; ++ + inet_twsk_put(tw); + return TCP_TW_SUCCESS; + } +@@ -270,11 +291,25 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) + tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; + tcptw->tw_rcv_nxt = tp->rcv_nxt; + tcptw->tw_snd_nxt = tp->snd_nxt; +- tcptw->tw_rcv_wnd = tcp_receive_window(tp); ++ /* no need to keep track of the right-most right edge ++ * when in time wait, can directly use the currently ++ * advertised window. 
++ */ ++ tcptw->tw_rcv_wnd = tcp_receive_window_now(tp); + tcptw->tw_ts_recent = tp->rx_opt.ts_recent; + tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; + tcptw->tw_ts_offset = tp->tsoffset; + tcptw->tw_last_oow_ack_time = 0; ++ ++ if (mptcp(tp)) { ++ if (mptcp_init_tw_sock(sk, tcptw)) { ++ inet_twsk_free(tw); ++ goto exit; ++ } ++ } else { ++ tcptw->mptcp_tw = NULL; ++ } ++ + tcptw->tw_tx_delay = tp->tcp_tx_delay; + #if IS_ENABLED(CONFIG_IPV6) + if (tw->tw_family == PF_INET6) { +@@ -336,6 +371,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPTIMEWAITOVERFLOW); + } + ++exit: + tcp_update_metrics(sk); + tcp_done(sk); + } +@@ -343,6 +379,10 @@ EXPORT_SYMBOL(tcp_time_wait); + + void tcp_twsk_destructor(struct sock *sk) + { ++ struct tcp_timewait_sock *twsk = tcp_twsk(sk); ++ ++ if (twsk->mptcp_tw) ++ mptcp_twsk_destructor(twsk); + #ifdef CONFIG_TCP_MD5SIG + if (static_branch_unlikely(&tcp_md5_needed)) { + struct tcp_timewait_sock *twsk = tcp_twsk(sk); +@@ -386,8 +426,9 @@ void tcp_openreq_init_rwin(struct request_sock *req, + full_space = rcv_wnd * mss; + + /* tcp_full_space because it is guaranteed to be the first packet */ +- tcp_select_initial_window(sk_listener, full_space, +- mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), ++ tp->ops->select_initial_window(sk_listener, full_space, ++ mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) - ++ (ireq->saw_mpc ? MPTCP_SUB_LEN_DSM_ALIGN : 0), + &req->rsk_rcv_wnd, + &req->rsk_window_clamp, + ireq->wscale_ok, +@@ -487,6 +528,8 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, + WRITE_ONCE(newtp->snd_nxt, seq); + newtp->snd_up = seq; + ++ newtp->out_of_order_queue = RB_ROOT; ++ newsk->tcp_rtx_queue = RB_ROOT; + INIT_LIST_HEAD(&newtp->tsq_node); + INIT_LIST_HEAD(&newtp->tsorted_sent_queue); + +@@ -511,6 +554,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, + newtp->window_clamp = req->rsk_window_clamp; + newtp->rcv_ssthresh = req->rsk_rcv_wnd; + newtp->rcv_wnd = req->rsk_rcv_wnd; ++ newtp->rcv_right_edge = newtp->rcv_wnd + newtp->rcv_wup; + newtp->rx_opt.wscale_ok = ireq->wscale_ok; + if (newtp->rx_opt.wscale_ok) { + newtp->rx_opt.snd_wscale = ireq->snd_wscale; +@@ -530,6 +574,8 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, + newtp->rx_opt.ts_recent_stamp = 0; + newtp->tcp_header_len = sizeof(struct tcphdr); + } ++ if (ireq->saw_mpc) ++ newtp->tcp_header_len += MPTCP_SUB_LEN_DSM_ALIGN; + if (req->num_timeout) { + newtp->undo_marker = treq->snt_isn; + newtp->retrans_stamp = div_u64(treq->snt_synack, +@@ -547,6 +593,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, + tcp_ecn_openreq_child(newtp, req); + newtp->fastopen_req = NULL; + RCU_INIT_POINTER(newtp->fastopen_rsk, NULL); ++ newtp->inside_tk_table = 0; + + __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS); + +@@ -570,15 +617,20 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, + bool fastopen, bool *req_stolen) + { + struct tcp_options_received tmp_opt; ++ struct mptcp_options_received mopt; + struct sock *child; + const struct tcphdr *th = tcp_hdr(skb); + __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); + bool paws_reject = false; + bool own_req; ++ bool meta_locked = false; + + tmp_opt.saw_tstamp = 0; ++ ++ mptcp_init_mp_opt(&mopt); ++ + if (th->doff > (sizeof(struct tcphdr)>>2)) { +- tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0, NULL); ++ tcp_parse_options(sock_net(sk), skb, &tmp_opt, &mopt, 0, NULL, NULL); + 
+ if (tmp_opt.saw_tstamp) { + tmp_opt.ts_recent = req->ts_recent; +@@ -619,7 +671,14 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, + * + * Reset timer after retransmitting SYNACK, similar to + * the idea of fast retransmit in recovery. ++ * ++ * Fall back to TCP if MP_CAPABLE is not set. + */ ++ ++ if (inet_rsk(req)->saw_mpc && !mopt.saw_mpc) ++ inet_rsk(req)->saw_mpc = false; ++ ++ + if (!tcp_oow_rate_limited(sock_net(sk), skb, + LINUX_MIB_TCPACKSKIPPEDSYNRECV, + &tcp_rsk(req)->last_oow_ack_time) && +@@ -767,17 +826,40 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, + * ESTABLISHED STATE. If it will be dropped after + * socket is created, wait for troubles. + */ ++ if (is_meta_sk(sk)) { ++ bh_lock_sock_nested(sk); ++ meta_locked = true; ++ } + child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL, + req, &own_req); + if (!child) + goto listen_overflow; + ++ if (own_req && !is_meta_sk(sk)) { ++ int ret = mptcp_check_req_master(sk, child, req, skb, &mopt, 1, 0); ++ if (ret < 0) ++ goto listen_overflow; ++ ++ /* MPTCP-supported */ ++ if (!ret) ++ return tcp_sk(child)->mpcb->master_sk; ++ } else if (own_req) { ++ return mptcp_check_req_child(sk, child, req, skb, &mopt); ++ } ++ ++ if (meta_locked) ++ bh_unlock_sock(sk); ++ + sock_rps_save_rxhash(child, skb); + tcp_synack_rtt_meas(child, req); + *req_stolen = !own_req; ++ + return inet_csk_complete_hashdance(sk, child, req, own_req); + + listen_overflow: ++ if (meta_locked) ++ bh_unlock_sock(sk); ++ + if (!sock_net(sk)->ipv4.sysctl_tcp_abort_on_overflow) { + inet_rsk(req)->acked = 1; + return NULL; +@@ -823,12 +905,13 @@ int tcp_child_process(struct sock *parent, struct sock *child, + { + int ret = 0; + int state = child->sk_state; ++ struct sock *meta_sk = mptcp(tcp_sk(child)) ? mptcp_meta_sk(child) : child; + + /* record NAPI ID of child */ + sk_mark_napi_id(child, skb); + + tcp_segs_in(tcp_sk(child), skb); +- if (!sock_owned_by_user(child)) { ++ if (!sock_owned_by_user(meta_sk)) { + ret = tcp_rcv_state_process(child, skb); + /* Wakeup parent, send SIGIO */ + if (state == TCP_SYN_RECV && child->sk_state != state) +@@ -838,10 +921,14 @@ int tcp_child_process(struct sock *parent, struct sock *child, + * in main socket hash table and lock on listening + * socket does not protect us more. + */ +- __sk_add_backlog(child, skb); ++ if (mptcp(tcp_sk(child))) ++ mptcp_prepare_for_backlog(child, skb); ++ __sk_add_backlog(meta_sk, skb); + } + + bh_unlock_sock(child); ++ if (mptcp(tcp_sk(child))) ++ bh_unlock_sock(meta_sk); + sock_put(child); + return ret; + } +diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c +index 638d7b49ad71..d246e537e686 100644 +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -37,6 +37,12 @@ + + #define pr_fmt(fmt) "TCP: " fmt + ++#include ++#include ++#if IS_ENABLED(CONFIG_IPV6) ++#include ++#endif ++#include + #include + + #include +@@ -57,11 +63,8 @@ void tcp_mstamp_refresh(struct tcp_sock *tp) + tp->tcp_mstamp = div_u64(val, NSEC_PER_USEC); + } + +-static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, +- int push_one, gfp_t gfp); +- + /* Account for new data that has been sent to the network. 
*/ +-static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb) ++void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb) + { + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); +@@ -255,12 +258,16 @@ EXPORT_SYMBOL(tcp_select_initial_window); + * value can be stuffed directly into th->window for an outgoing + * frame. + */ +-static u16 tcp_select_window(struct sock *sk) ++u16 tcp_select_window(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); + u32 old_win = tp->rcv_wnd; +- u32 cur_win = tcp_receive_window(tp); +- u32 new_win = __tcp_select_window(sk); ++ /* The window must never shrink at the meta-level. At the subflow we ++ * have to allow this. Otherwise we may announce a window too large ++ * for the current meta-level sk_rcvbuf. ++ */ ++ u32 cur_win = tcp_receive_window_now(mptcp(tp) ? tcp_sk(mptcp_meta_sk(sk)) : tp); ++ u32 new_win = tp->ops->__select_window(sk); + + /* Never shrink the offered window */ + if (new_win < cur_win) { +@@ -276,8 +283,10 @@ static u16 tcp_select_window(struct sock *sk) + LINUX_MIB_TCPWANTZEROWINDOWADV); + new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale); + } ++ + tp->rcv_wnd = new_win; + tp->rcv_wup = tp->rcv_nxt; ++ tcp_update_rcv_right_edge(tp); + + /* Make sure we do not exceed the maximum possible + * scaled window. +@@ -388,7 +397,7 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, + /* Constructs common control bits of non-data skb. If SYN/FIN is present, + * auto increment end seqno. + */ +-static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) ++void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) + { + skb->ip_summed = CHECKSUM_PARTIAL; + +@@ -403,7 +412,7 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) + TCP_SKB_CB(skb)->end_seq = seq; + } + +-static inline bool tcp_urg_mode(const struct tcp_sock *tp) ++bool tcp_urg_mode(const struct tcp_sock *tp) + { + return tp->snd_una != tp->snd_up; + } +@@ -414,6 +423,7 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp) + #define OPTION_WSCALE (1 << 3) + #define OPTION_FAST_OPEN_COOKIE (1 << 8) + #define OPTION_SMC (1 << 9) ++/* Before adding here - take a look at OPTION_MPTCP in include/net/mptcp.h */ + + static void smc_options_write(__be32 *ptr, u16 *options) + { +@@ -430,17 +440,6 @@ static void smc_options_write(__be32 *ptr, u16 *options) + #endif + } + +-struct tcp_out_options { +- u16 options; /* bit field of OPTION_* */ +- u16 mss; /* 0 to disable */ +- u8 ws; /* window scale, 0 to disable */ +- u8 num_sack_blocks; /* number of SACK blocks to include */ +- u8 hash_size; /* bytes in hash_location */ +- __u8 *hash_location; /* temporary pointer, overloaded */ +- __u32 tsval, tsecr; /* need to include OPTION_TS */ +- struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */ +-}; +- + /* Write previously computed TCP options to the packet. + * + * Beware: Something in the Internet is very sensitive to the ordering of +@@ -455,7 +454,7 @@ struct tcp_out_options { + * (but it may well be that other scenarios fail similarly). 
+ */ + static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, +- struct tcp_out_options *opts) ++ struct tcp_out_options *opts, struct sk_buff *skb) + { + u16 options = opts->options; /* mungable copy */ + +@@ -549,6 +548,9 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, + } + + smc_options_write(ptr, &options); ++ ++ if (unlikely(OPTION_MPTCP & opts->options)) ++ mptcp_options_write(ptr, tp, opts, skb); + } + + static void smc_set_option(const struct tcp_sock *tp, +@@ -635,6 +637,8 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, + if (unlikely(!(OPTION_TS & opts->options))) + remaining -= TCPOLEN_SACKPERM_ALIGNED; + } ++ if (tp->request_mptcp || mptcp(tp)) ++ mptcp_syn_options(sk, opts, &remaining); + + if (fastopen && fastopen->cookie.len >= 0) { + u32 need = fastopen->cookie.len; +@@ -718,6 +722,9 @@ static unsigned int tcp_synack_options(const struct sock *sk, + + smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining); + ++ if (ireq->saw_mpc) ++ mptcp_synack_options(req, opts, &remaining); ++ + return MAX_TCP_OPTION_SPACE - remaining; + } + +@@ -752,14 +759,19 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb + opts->tsecr = tp->rx_opt.ts_recent; + size += TCPOLEN_TSTAMP_ALIGNED; + } ++ if (mptcp(tp)) ++ mptcp_established_options(sk, skb, opts, &size); + + eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack; + if (unlikely(eff_sacks)) { +- const unsigned int remaining = MAX_TCP_OPTION_SPACE - size; +- opts->num_sack_blocks = +- min_t(unsigned int, eff_sacks, +- (remaining - TCPOLEN_SACK_BASE_ALIGNED) / +- TCPOLEN_SACK_PERBLOCK); ++ const unsigned remaining = MAX_TCP_OPTION_SPACE - size; ++ if (remaining < TCPOLEN_SACK_BASE_ALIGNED) ++ opts->num_sack_blocks = 0; ++ else ++ opts->num_sack_blocks = ++ min_t(unsigned int, eff_sacks, ++ (remaining - TCPOLEN_SACK_BASE_ALIGNED) / ++ TCPOLEN_SACK_PERBLOCK); + if (likely(opts->num_sack_blocks)) + size += TCPOLEN_SACK_BASE_ALIGNED + + opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK; +@@ -802,19 +814,31 @@ static void tcp_tsq_write(struct sock *sk) + tcp_xmit_retransmit_queue(sk); + } + +- tcp_write_xmit(sk, tcp_current_mss(sk), tp->nonagle, +- 0, GFP_ATOMIC); ++ tcp_sk(sk)->ops->write_xmit(sk, tcp_current_mss(sk), ++ tcp_sk(sk)->nonagle, 0, GFP_ATOMIC); + } + } + + static void tcp_tsq_handler(struct sock *sk) + { +- bh_lock_sock(sk); +- if (!sock_owned_by_user(sk)) ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct sock *meta_sk = mptcp(tp) ? mptcp_meta_sk(sk) : sk; ++ ++ bh_lock_sock(meta_sk); ++ if (!sock_owned_by_user(meta_sk)) { + tcp_tsq_write(sk); +- else if (!test_and_set_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags)) +- sock_hold(sk); +- bh_unlock_sock(sk); ++ ++ if (mptcp(tp)) ++ tcp_tsq_write(meta_sk); ++ } else { ++ if (!test_and_set_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags)) ++ sock_hold(sk); ++ ++ if ((mptcp(tp)) && (sk->sk_state != TCP_CLOSE)) ++ mptcp_tsq_flags(sk); ++ } ++ ++ bh_unlock_sock(meta_sk); + } + /* + * One tasklet per cpu tries to send more skbs. 
+@@ -851,7 +875,9 @@ static void tcp_tasklet_func(unsigned long data) + #define TCP_DEFERRED_ALL (TCPF_TSQ_DEFERRED | \ + TCPF_WRITE_TIMER_DEFERRED | \ + TCPF_DELACK_TIMER_DEFERRED | \ +- TCPF_MTU_REDUCED_DEFERRED) ++ TCPF_MTU_REDUCED_DEFERRED | \ ++ TCPF_PATH_MANAGER_DEFERRED |\ ++ TCPF_SUB_DEFERRED) + /** + * tcp_release_cb - tcp release_sock() callback + * @sk: socket +@@ -874,6 +900,9 @@ void tcp_release_cb(struct sock *sk) + if (flags & TCPF_TSQ_DEFERRED) { + tcp_tsq_write(sk); + __sock_put(sk); ++ ++ if (mptcp(tcp_sk(sk))) ++ tcp_tsq_write(mptcp_meta_sk(sk)); + } + /* Here begins the tricky part : + * We are called from release_sock() with : +@@ -898,6 +927,13 @@ void tcp_release_cb(struct sock *sk) + inet_csk(sk)->icsk_af_ops->mtu_reduced(sk); + __sock_put(sk); + } ++ if (flags & TCPF_PATH_MANAGER_DEFERRED) { ++ if (tcp_sk(sk)->mpcb->pm_ops->release_sock) ++ tcp_sk(sk)->mpcb->pm_ops->release_sock(sk); ++ __sock_put(sk); ++ } ++ if (flags & TCPF_SUB_DEFERRED) ++ mptcp_tsq_sub_deferred(sk); + } + EXPORT_SYMBOL(tcp_release_cb); + +@@ -981,8 +1017,8 @@ enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer) + return HRTIMER_NORESTART; + } + +-static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb, +- u64 prior_wstamp) ++void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb, ++ u64 prior_wstamp) + { + struct tcp_sock *tp = tcp_sk(sk); + +@@ -1128,10 +1164,10 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, + } + } + +- tcp_options_write((__be32 *)(th + 1), tp, &opts); ++ tcp_options_write((__be32 *)(th + 1), tp, &opts, skb); + skb_shinfo(skb)->gso_type = sk->sk_gso_type; + if (likely(!(tcb->tcp_flags & TCPHDR_SYN))) { +- th->window = htons(tcp_select_window(sk)); ++ th->window = htons(tp->ops->select_window(sk)); + tcp_ecn_send(sk, skb, th, tcp_header_size); + } else { + /* RFC1323: The window in SYN & SYN/ACK segments +@@ -1189,8 +1225,8 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, + return err; + } + +-static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, +- gfp_t gfp_mask) ++int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, ++ gfp_t gfp_mask) + { + return __tcp_transmit_skb(sk, skb, clone_it, gfp_mask, + tcp_sk(sk)->rcv_nxt); +@@ -1201,7 +1237,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, + * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames, + * otherwise socket can stall. + */ +-static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) ++void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) + { + struct tcp_sock *tp = tcp_sk(sk); + +@@ -1214,7 +1250,7 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) + } + + /* Initialize TSO segments for a packet. 
*/ +-static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now) ++void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now) + { + if (skb->len <= mss_now) { + /* Avoid the costly divide in the normal +@@ -1231,7 +1267,7 @@ static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now) + /* Pcount in the middle of the write queue got changed, we need to do various + * tweaks to fix counters + */ +-static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr) ++void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr) + { + struct tcp_sock *tp = tcp_sk(sk); + +@@ -1400,7 +1436,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, + /* This is similar to __pskb_pull_tail(). The difference is that pulled + * data is not copied, but immediately discarded. + */ +-static int __pskb_trim_head(struct sk_buff *skb, int len) ++int __pskb_trim_head(struct sk_buff *skb, int len) + { + struct skb_shared_info *shinfo; + int i, k, eat; +@@ -1623,6 +1659,7 @@ unsigned int tcp_current_mss(struct sock *sk) + + return mss_now; + } ++EXPORT_SYMBOL(tcp_current_mss); + + /* RFC2861, slow part. Adjust cwnd, after it was not full during one rto. + * As additional protections, we do not touch cwnd in retransmission phases, +@@ -1682,8 +1719,11 @@ static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited) + * 2) not cwnd limited (this else condition) + * 3) no more data to send (tcp_write_queue_empty()) + * 4) application is hitting buffer limit (SOCK_NOSPACE) ++ * 5) For MPTCP subflows, the scheduler determines ++ * sndbuf limited. + */ + if (tcp_write_queue_empty(sk) && sk->sk_socket && ++ !(mptcp(tcp_sk(sk)) && !is_meta_sk(sk)) && + test_bit(SOCK_NOSPACE, &sk->sk_socket->flags) && + (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) + tcp_chrono_start(sk, TCP_CHRONO_SNDBUF_LIMITED); +@@ -1705,8 +1745,8 @@ static bool tcp_minshall_check(const struct tcp_sock *tp) + * But we can avoid doing the divide again given we already have + * skb_pcount = skb->len / mss_now + */ +-static void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now, +- const struct sk_buff *skb) ++void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now, ++ const struct sk_buff *skb) + { + if (skb->len < tcp_skb_pcount(skb) * mss_now) + tp->snd_sml = TCP_SKB_CB(skb)->end_seq; +@@ -1752,7 +1792,7 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, + /* Return the number of segments we want in the skb we are transmitting. + * See if congestion control module wants to decide; otherwise, autosize. 
+ */ +-static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now) ++u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now) + { + const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; + u32 min_tso, tso_segs; +@@ -1766,11 +1806,11 @@ static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now) + } + + /* Returns the portion of skb which can be sent right away */ +-static unsigned int tcp_mss_split_point(const struct sock *sk, +- const struct sk_buff *skb, +- unsigned int mss_now, +- unsigned int max_segs, +- int nonagle) ++unsigned int tcp_mss_split_point(const struct sock *sk, ++ const struct sk_buff *skb, ++ unsigned int mss_now, ++ unsigned int max_segs, ++ int nonagle) + { + const struct tcp_sock *tp = tcp_sk(sk); + u32 partial, needed, window, max_len; +@@ -1800,13 +1840,14 @@ static unsigned int tcp_mss_split_point(const struct sock *sk, + /* Can at least one segment of SKB be sent right now, according to the + * congestion window rules? If so, return how many segments are allowed. + */ +-static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp, +- const struct sk_buff *skb) ++unsigned int tcp_cwnd_test(const struct tcp_sock *tp, ++ const struct sk_buff *skb) + { + u32 in_flight, cwnd, halfcwnd; + + /* Don't be strict about the congestion window for the final FIN. */ +- if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) && ++ if (skb && ++ (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) && + tcp_skb_pcount(skb) == 1) + return 1; + +@@ -1821,12 +1862,13 @@ static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp, + halfcwnd = max(cwnd >> 1, 1U); + return min(halfcwnd, cwnd - in_flight); + } ++EXPORT_SYMBOL(tcp_cwnd_test); + + /* Initialize TSO state of a skb. + * This must be invoked the first time we consider transmitting + * SKB onto the wire. + */ +-static int tcp_init_tso_segs(struct sk_buff *skb, unsigned int mss_now) ++int tcp_init_tso_segs(struct sk_buff *skb, unsigned int mss_now) + { + int tso_segs = tcp_skb_pcount(skb); + +@@ -1841,8 +1883,8 @@ static int tcp_init_tso_segs(struct sk_buff *skb, unsigned int mss_now) + /* Return true if the Nagle test allows this packet to be + * sent now. + */ +-static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb, +- unsigned int cur_mss, int nonagle) ++bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb, ++ unsigned int cur_mss, int nonagle) + { + /* Nagle rule does not apply to frames, which sit in the middle of the + * write_queue (they have no chances to get new data). +@@ -1854,7 +1896,8 @@ static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buf + return true; + + /* Don't use the nagle rule for urgent data (or for the final FIN). */ +- if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) ++ if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) || ++ mptcp_is_data_fin(skb)) + return true; + + if (!tcp_nagle_check(skb->len < cur_mss, tp, nonagle)) +@@ -1864,9 +1907,8 @@ static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buf + } + + /* Does at least the first segment of SKB fit into the send window? 
*/ +-static bool tcp_snd_wnd_test(const struct tcp_sock *tp, +- const struct sk_buff *skb, +- unsigned int cur_mss) ++bool tcp_snd_wnd_test(const struct tcp_sock *tp, const struct sk_buff *skb, ++ unsigned int cur_mss) + { + u32 end_seq = TCP_SKB_CB(skb)->end_seq; + +@@ -1875,6 +1917,7 @@ static bool tcp_snd_wnd_test(const struct tcp_sock *tp, + + return !after(end_seq, tcp_wnd_end(tp)); + } ++EXPORT_SYMBOL(tcp_snd_wnd_test); + + /* Trim TSO SKB to LEN bytes, put the remaining data into a new packet + * which is put after SKB on the list. It is very much like +@@ -2033,7 +2076,8 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, + + /* If this packet won't get more data, do not wait. */ + if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) || +- TCP_SKB_CB(skb)->eor) ++ TCP_SKB_CB(skb)->eor || ++ mptcp_is_data_fin(skb)) + goto send_now; + + return true; +@@ -2366,7 +2410,7 @@ void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type) + * Returns true, if no segments are in flight and we have queued segments, + * but cannot send anything now because of SWS or another problem. + */ +-static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, ++bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, + int push_one, gfp_t gfp) + { + struct tcp_sock *tp = tcp_sk(sk); +@@ -2380,7 +2424,12 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, + sent_pkts = 0; + + tcp_mstamp_refresh(tp); +- if (!push_one) { ++ ++ /* pmtu not yet supported with MPTCP. Should be possible, by early ++ * exiting the loop inside tcp_mtu_probe, making sure that only one ++ * single DSS-mapping gets probed. ++ */ ++ if (!push_one && !mptcp(tp)) { + /* Do MTU probing. */ + result = tcp_mtu_probe(sk); + if (!result) { +@@ -2576,7 +2625,7 @@ void tcp_send_loss_probe(struct sock *sk) + skb = tcp_send_head(sk); + if (skb && tcp_snd_wnd_test(tp, skb, mss)) { + pcount = tp->packets_out; +- tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC); ++ tp->ops->write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC); + if (tp->packets_out > pcount) + goto probe_sent; + goto rearm_timer; +@@ -2638,8 +2687,8 @@ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss, + if (unlikely(sk->sk_state == TCP_CLOSE)) + return; + +- if (tcp_write_xmit(sk, cur_mss, nonagle, 0, +- sk_gfp_mask(sk, GFP_ATOMIC))) ++ if (tcp_sk(sk)->ops->write_xmit(sk, cur_mss, nonagle, 0, ++ sk_gfp_mask(sk, GFP_ATOMIC))) + tcp_check_probe_timer(sk); + } + +@@ -2652,7 +2701,8 @@ void tcp_push_one(struct sock *sk, unsigned int mss_now) + + BUG_ON(!skb || skb->len < mss_now); + +- tcp_write_xmit(sk, mss_now, TCP_NAGLE_PUSH, 1, sk->sk_allocation); ++ tcp_sk(sk)->ops->write_xmit(sk, mss_now, TCP_NAGLE_PUSH, 1, ++ sk->sk_allocation); + } + + /* This function returns the amount that we can raise the +@@ -2874,6 +2924,10 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to, + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) + return; + ++ /* Currently not supported for MPTCP - but it should be possible */ ++ if (mptcp(tp)) ++ return; ++ + skb_rbtree_walk_from_safe(skb, tmp) { + if (!tcp_can_collapse(sk, skb)) + break; +@@ -3355,7 +3409,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, + + /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. 
*/ + th->window = htons(min(req->rsk_rcv_wnd, 65535U)); +- tcp_options_write((__be32 *)(th + 1), NULL, &opts); ++ tcp_options_write((__be32 *)(th + 1), NULL, &opts, skb); + th->doff = (tcp_header_size >> 2); + __TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS); + +@@ -3437,13 +3491,13 @@ static void tcp_connect_init(struct sock *sk) + if (rcv_wnd == 0) + rcv_wnd = dst_metric(dst, RTAX_INITRWND); + +- tcp_select_initial_window(sk, tcp_full_space(sk), +- tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), +- &tp->rcv_wnd, +- &tp->window_clamp, +- sock_net(sk)->ipv4.sysctl_tcp_window_scaling, +- &rcv_wscale, +- rcv_wnd); ++ tp->ops->select_initial_window(sk, tcp_full_space(sk), ++ tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), ++ &tp->rcv_wnd, ++ &tp->window_clamp, ++ sock_net(sk)->ipv4.sysctl_tcp_window_scaling, ++ &rcv_wscale, ++ rcv_wnd); + + tp->rx_opt.rcv_wscale = rcv_wscale; + tp->rcv_ssthresh = tp->rcv_wnd; +@@ -3463,11 +3517,43 @@ static void tcp_connect_init(struct sock *sk) + else + tp->rcv_tstamp = tcp_jiffies32; + tp->rcv_wup = tp->rcv_nxt; ++ /* force set rcv_right_edge here at start of connection */ ++ tp->rcv_right_edge = tp->rcv_wup + tp->rcv_wnd; + WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); + + inet_csk(sk)->icsk_rto = tcp_timeout_init(sk); + inet_csk(sk)->icsk_retransmits = 0; + tcp_clear_retrans(tp); ++ ++#ifdef CONFIG_MPTCP ++ if (sock_flag(sk, SOCK_MPTCP) && mptcp_doit(sk)) { ++ if (is_master_tp(tp)) { ++ tp->request_mptcp = 1; ++ mptcp_connect_init(sk); ++ } else if (tp->mptcp) { ++ struct inet_sock *inet = inet_sk(sk); ++ ++ tp->mptcp->snt_isn = tp->write_seq; ++ tp->mptcp->init_rcv_wnd = tp->rcv_wnd; ++ ++ /* Set nonce for new subflows */ ++ if (sk->sk_family == AF_INET) ++ tp->mptcp->mptcp_loc_nonce = mptcp_v4_get_nonce( ++ inet->inet_saddr, ++ inet->inet_daddr, ++ inet->inet_sport, ++ inet->inet_dport); ++#if IS_ENABLED(CONFIG_IPV6) ++ else ++ tp->mptcp->mptcp_loc_nonce = mptcp_v6_get_nonce( ++ inet6_sk(sk)->saddr.s6_addr32, ++ sk->sk_v6_daddr.s6_addr32, ++ inet->inet_sport, ++ inet->inet_dport); ++#endif ++ } ++ } ++#endif + } + + static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb) +@@ -3731,6 +3817,7 @@ void tcp_send_ack(struct sock *sk) + { + __tcp_send_ack(sk, tcp_sk(sk)->rcv_nxt); + } ++EXPORT_SYMBOL_GPL(tcp_send_ack); + + /* This routine sends a packet with an out of date sequence + * number. It assumes the other end will try to ack it. +@@ -3743,7 +3830,7 @@ void tcp_send_ack(struct sock *sk) + * one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is + * out-of-date with SND.UNA-1 to probe window. + */ +-static int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib) ++int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib) + { + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; +@@ -3830,7 +3917,7 @@ void tcp_send_probe0(struct sock *sk) + unsigned long timeout; + int err; + +- err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE); ++ err = tp->ops->write_wakeup(sk, LINUX_MIB_TCPWINPROBE); + + if (tp->packets_out || tcp_write_queue_empty(sk)) { + /* Cancel probe timer, if it is not required. 
*/ +diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c +index fa2ae96ecdc4..36199efe2837 100644 +--- a/net/ipv4/tcp_timer.c ++++ b/net/ipv4/tcp_timer.c +@@ -21,6 +21,7 @@ + + #include + #include ++#include + #include + + static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk) +@@ -65,7 +66,7 @@ u32 tcp_clamp_probe0_to_user_timeout(const struct sock *sk, u32 when) + * Returns: Nothing (void) + */ + +-static void tcp_write_err(struct sock *sk) ++void tcp_write_err(struct sock *sk) + { + sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT; + sk->sk_error_report(sk); +@@ -121,7 +122,7 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset) + (!tp->snd_wnd && !tp->packets_out)) + do_reset = true; + if (do_reset) +- tcp_send_active_reset(sk, GFP_ATOMIC); ++ tp->ops->send_active_reset(sk, GFP_ATOMIC); + tcp_done(sk); + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY); + return 1; +@@ -206,9 +207,9 @@ static unsigned int tcp_model_timeout(struct sock *sk, + * after "boundary" unsuccessful, exponentially backed-off + * retransmissions with an initial RTO of TCP_RTO_MIN. + */ +-static bool retransmits_timed_out(struct sock *sk, +- unsigned int boundary, +- unsigned int timeout) ++bool retransmits_timed_out(struct sock *sk, ++ unsigned int boundary, ++ unsigned int timeout) + { + unsigned int start_ts; + +@@ -228,7 +229,7 @@ static bool retransmits_timed_out(struct sock *sk, + } + + /* A write timeout has occurred. Process the after effects. */ +-static int tcp_write_timeout(struct sock *sk) ++int tcp_write_timeout(struct sock *sk) + { + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); +@@ -243,6 +244,17 @@ static int tcp_write_timeout(struct sock *sk) + sk_rethink_txhash(sk); + } + retry_until = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries; ++ ++#ifdef CONFIG_MPTCP ++ /* Stop retransmitting MP_CAPABLE options in SYN if timed out. */ ++ if (tcp_sk(sk)->request_mptcp && ++ icsk->icsk_retransmits >= sysctl_mptcp_syn_retries) { ++ tcp_sk(sk)->request_mptcp = 0; ++ ++ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLERETRANSFALLBACK); ++ } ++#endif /* CONFIG_MPTCP */ ++ + expired = icsk->icsk_retransmits >= retry_until; + } else { + if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1, 0)) { +@@ -338,18 +350,22 @@ static void tcp_delack_timer(struct timer_list *t) + struct inet_connection_sock *icsk = + from_timer(icsk, t, icsk_delack_timer); + struct sock *sk = &icsk->icsk_inet.sk; ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct sock *meta_sk = mptcp(tp) ? mptcp_meta_sk(sk) : sk; + +- bh_lock_sock(sk); +- if (!sock_owned_by_user(sk)) { ++ bh_lock_sock(meta_sk); ++ if (!sock_owned_by_user(meta_sk)) { + tcp_delack_timer_handler(sk); + } else { + icsk->icsk_ack.blocked = 1; +- __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED); ++ __NET_INC_STATS(sock_net(meta_sk), LINUX_MIB_DELAYEDACKLOCKED); + /* deleguate our work to tcp_release_cb() */ + if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &sk->sk_tsq_flags)) + sock_hold(sk); ++ if (mptcp(tp)) ++ mptcp_tsq_flags(sk); + } +- bh_unlock_sock(sk); ++ bh_unlock_sock(meta_sk); + sock_put(sk); + } + +@@ -393,7 +409,12 @@ static void tcp_probe_timer(struct sock *sk) + } + + if (icsk->icsk_probes_out >= max_probes) { +-abort: tcp_write_err(sk); ++abort: ++ tcp_write_err(sk); ++ if (is_meta_sk(sk) && ++ mptcp_in_infinite_mapping_weak(tp->mpcb)) { ++ mptcp_sub_force_close_all(tp->mpcb, NULL); ++ } + } else { + /* Only send another probe if we didn't close things up. 
*/ + tcp_send_probe0(sk); +@@ -614,7 +635,7 @@ void tcp_write_timer_handler(struct sock *sk) + break; + case ICSK_TIME_RETRANS: + icsk->icsk_pending = 0; +- tcp_retransmit_timer(sk); ++ tcp_sk(sk)->ops->retransmit_timer(sk); + break; + case ICSK_TIME_PROBE0: + icsk->icsk_pending = 0; +@@ -631,16 +652,19 @@ static void tcp_write_timer(struct timer_list *t) + struct inet_connection_sock *icsk = + from_timer(icsk, t, icsk_retransmit_timer); + struct sock *sk = &icsk->icsk_inet.sk; ++ struct sock *meta_sk = mptcp(tcp_sk(sk)) ? mptcp_meta_sk(sk) : sk; + +- bh_lock_sock(sk); +- if (!sock_owned_by_user(sk)) { ++ bh_lock_sock(meta_sk); ++ if (!sock_owned_by_user(meta_sk)) { + tcp_write_timer_handler(sk); + } else { + /* delegate our work to tcp_release_cb() */ + if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &sk->sk_tsq_flags)) + sock_hold(sk); ++ if (mptcp(tcp_sk(sk))) ++ mptcp_tsq_flags(sk); + } +- bh_unlock_sock(sk); ++ bh_unlock_sock(meta_sk); + sock_put(sk); + } + +@@ -670,11 +694,12 @@ static void tcp_keepalive_timer (struct timer_list *t) + struct sock *sk = from_timer(sk, t, sk_timer); + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); ++ struct sock *meta_sk = mptcp(tp) ? mptcp_meta_sk(sk) : sk; + u32 elapsed; + + /* Only process if socket is not in use. */ +- bh_lock_sock(sk); +- if (sock_owned_by_user(sk)) { ++ bh_lock_sock(meta_sk); ++ if (sock_owned_by_user(meta_sk)) { + /* Try again later. */ + inet_csk_reset_keepalive_timer (sk, HZ/20); + goto out; +@@ -686,16 +711,31 @@ static void tcp_keepalive_timer (struct timer_list *t) + } + + tcp_mstamp_refresh(tp); ++ ++ if (tp->send_mp_fclose) { ++ if (icsk->icsk_retransmits >= MPTCP_FASTCLOSE_RETRIES) { ++ tcp_write_err(sk); ++ goto out; ++ } ++ ++ tcp_send_ack(sk); ++ icsk->icsk_retransmits++; ++ ++ icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); ++ elapsed = icsk->icsk_rto; ++ goto resched; ++ } ++ + if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) { + if (tp->linger2 >= 0) { + const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN; + + if (tmo > 0) { +- tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); ++ tp->ops->time_wait(sk, TCP_FIN_WAIT2, tmo); + goto out; + } + } +- tcp_send_active_reset(sk, GFP_ATOMIC); ++ tp->ops->send_active_reset(sk, GFP_ATOMIC); + goto death; + } + +@@ -720,11 +760,11 @@ static void tcp_keepalive_timer (struct timer_list *t) + icsk->icsk_probes_out > 0) || + (icsk->icsk_user_timeout == 0 && + icsk->icsk_probes_out >= keepalive_probes(tp))) { +- tcp_send_active_reset(sk, GFP_ATOMIC); ++ tp->ops->send_active_reset(sk, GFP_ATOMIC); + tcp_write_err(sk); + goto out; + } +- if (tcp_write_wakeup(sk, LINUX_MIB_TCPKEEPALIVE) <= 0) { ++ if (tp->ops->write_wakeup(sk, LINUX_MIB_TCPKEEPALIVE) <= 0) { + icsk->icsk_probes_out++; + elapsed = keepalive_intvl_when(tp); + } else { +@@ -748,7 +788,7 @@ static void tcp_keepalive_timer (struct timer_list *t) + tcp_done(sk); + + out: +- bh_unlock_sock(sk); ++ bh_unlock_sock(meta_sk); + sock_put(sk); + } + +diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c +index 366c3792b860..edf439019e37 100644 +--- a/net/ipv6/addrconf.c ++++ b/net/ipv6/addrconf.c +@@ -967,6 +967,7 @@ void inet6_ifa_finish_destroy(struct inet6_ifaddr *ifp) + + kfree_rcu(ifp, rcu); + } ++EXPORT_SYMBOL(inet6_ifa_finish_destroy); + + static void + ipv6_link_dev_addr(struct inet6_dev *idev, struct inet6_ifaddr *ifp) +diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c +index 14ac1d911287..a3c93ec02c96 100644 +--- a/net/ipv6/af_inet6.c ++++ b/net/ipv6/af_inet6.c +@@ 
-104,8 +104,7 @@ static __inline__ struct ipv6_pinfo *inet6_sk_generic(struct sock *sk) + return (struct ipv6_pinfo *)(((u8 *)sk) + offset); + } + +-static int inet6_create(struct net *net, struct socket *sock, int protocol, +- int kern) ++int inet6_create(struct net *net, struct socket *sock, int protocol, int kern) + { + struct inet_sock *inet; + struct ipv6_pinfo *np; +diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c +index 5352c7e68c42..534a9d2e4858 100644 +--- a/net/ipv6/ipv6_sockglue.c ++++ b/net/ipv6/ipv6_sockglue.c +@@ -44,6 +44,8 @@ + #include + #include + #include ++#include ++#include + #include + #include + #include +@@ -221,7 +223,12 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, + sock_prot_inuse_add(net, &tcp_prot, 1); + local_bh_enable(); + sk->sk_prot = &tcp_prot; +- icsk->icsk_af_ops = &ipv4_specific; ++#ifdef CONFIG_MPTCP ++ if (sock_flag(sk, SOCK_MPTCP)) ++ icsk->icsk_af_ops = &mptcp_v4_specific; ++ else ++#endif ++ icsk->icsk_af_ops = &ipv4_specific; + sk->sk_socket->ops = &inet_stream_ops; + sk->sk_family = PF_INET; + tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); +@@ -345,6 +352,17 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, + if (val == -1) + val = 0; + np->tclass = val; ++ ++ if (is_meta_sk(sk)) { ++ struct mptcp_tcp_sock *mptcp; ++ ++ mptcp_for_each_sub(tcp_sk(sk)->mpcb, mptcp) { ++ struct sock *sk_it = mptcp_to_sock(mptcp); ++ ++ if (sk_it->sk_family == AF_INET6) ++ inet6_sk(sk_it)->tclass = val; ++ } ++ } + retv = 0; + break; + +diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c +index ec155844012b..225c015b60a8 100644 +--- a/net/ipv6/syncookies.c ++++ b/net/ipv6/syncookies.c +@@ -15,6 +15,8 @@ + #include + #include + #include ++#include ++#include + #include + + #define COOKIEBITS 24 /* Upper bits store count */ +@@ -106,7 +108,8 @@ u32 __cookie_v6_init_sequence(const struct ipv6hdr *iph, + } + EXPORT_SYMBOL_GPL(__cookie_v6_init_sequence); + +-__u32 cookie_v6_init_sequence(const struct sk_buff *skb, __u16 *mssp) ++__u32 cookie_v6_init_sequence(struct request_sock *req, const struct sock *sk, ++ const struct sk_buff *skb, __u16 *mssp) + { + const struct ipv6hdr *iph = ipv6_hdr(skb); + const struct tcphdr *th = tcp_hdr(skb); +@@ -128,6 +131,7 @@ EXPORT_SYMBOL_GPL(__cookie_v6_check); + struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) + { + struct tcp_options_received tcp_opt; ++ struct mptcp_options_received mopt; + struct inet_request_sock *ireq; + struct tcp_request_sock *treq; + struct ipv6_pinfo *np = inet6_sk(sk); +@@ -157,7 +161,8 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) + + /* check for timestamp cookie support */ + memset(&tcp_opt, 0, sizeof(tcp_opt)); +- tcp_parse_options(sock_net(sk), skb, &tcp_opt, 0, NULL); ++ mptcp_init_mp_opt(&mopt); ++ tcp_parse_options(sock_net(sk), skb, &tcp_opt, &mopt, 0, NULL, NULL); + + if (tcp_opt.saw_tstamp && tcp_opt.rcv_tsecr) { + tsoff = secure_tcpv6_ts_off(sock_net(sk), +@@ -170,14 +175,27 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) + goto out; + + ret = NULL; +- req = inet_reqsk_alloc(&tcp6_request_sock_ops, sk, false); ++#ifdef CONFIG_MPTCP ++ if (mopt.saw_mpc) ++ req = inet_reqsk_alloc(&mptcp6_request_sock_ops, sk, false); ++ else ++#endif ++ req = inet_reqsk_alloc(&tcp6_request_sock_ops, sk, false); + if (!req) + goto out; + + ireq = inet_rsk(req); ++ ireq->mptcp_rqsk = 0; ++ ireq->saw_mpc = 0; + treq = tcp_rsk(req); + treq->tfo_listener = false; + ++ /* Must be done 
before anything else, as it initializes ++ * hash_entry of the MPTCP request-sock. ++ */ ++ if (mopt.saw_mpc) ++ mptcp_cookies_reqsk_init(req, &mopt, skb); ++ + if (security_inet_conn_request(sk, skb, req)) + goto out_free; + +@@ -247,15 +265,15 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) + (req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0)) + req->rsk_window_clamp = full_space; + +- tcp_select_initial_window(sk, full_space, req->mss, +- &req->rsk_rcv_wnd, &req->rsk_window_clamp, +- ireq->wscale_ok, &rcv_wscale, +- dst_metric(dst, RTAX_INITRWND)); ++ tp->ops->select_initial_window(sk, full_space, req->mss, ++ &req->rsk_rcv_wnd, &req->rsk_window_clamp, ++ ireq->wscale_ok, &rcv_wscale, ++ dst_metric(dst, RTAX_INITRWND)); + + ireq->rcv_wscale = rcv_wscale; + ireq->ecn_ok = cookie_ecn_ok(&tcp_opt, sock_net(sk), dst); + +- ret = tcp_get_cookie_sock(sk, skb, req, dst, tsoff); ++ ret = tcp_get_cookie_sock(sk, skb, req, &mopt, dst, tsoff); + out: + return ret; + out_free: +diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c +index 3903cc0ab188..2f91fddabceb 100644 +--- a/net/ipv6/tcp_ipv6.c ++++ b/net/ipv6/tcp_ipv6.c +@@ -58,6 +58,8 @@ + #include + #include + #include ++#include ++#include + #include + + #include +@@ -67,15 +69,6 @@ + #include + + #include +- +-static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb); +-static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, +- struct request_sock *req); +- +-static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb); +- +-static const struct inet_connection_sock_af_ops ipv6_mapped; +-static const struct inet_connection_sock_af_ops ipv6_specific; + #ifdef CONFIG_TCP_MD5SIG + static const struct tcp_sock_af_ops tcp_sock_ipv6_specific; + static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific; +@@ -99,7 +92,7 @@ static struct ipv6_pinfo *tcp_inet6_sk(const struct sock *sk) + return (struct ipv6_pinfo *)(((u8 *)sk) + offset); + } + +-static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) ++void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) + { + struct dst_entry *dst = skb_dst(skb); + +@@ -141,7 +134,7 @@ static int tcp_v6_pre_connect(struct sock *sk, struct sockaddr *uaddr, + return BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr); + } + +-static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, ++int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, + int addr_len) + { + struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr; +@@ -157,6 +150,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, + int err; + struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row; + ++ mptcp_init_connect(sk); ++ + if (addr_len < SIN6_LEN_RFC2133) + return -EINVAL; + +@@ -236,7 +231,12 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, + sin.sin_port = usin->sin6_port; + sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3]; + +- icsk->icsk_af_ops = &ipv6_mapped; ++#ifdef CONFIG_MPTCP ++ if (sock_flag(sk, SOCK_MPTCP)) ++ icsk->icsk_af_ops = &mptcp_v6_mapped; ++ else ++#endif ++ icsk->icsk_af_ops = &ipv6_mapped; + sk->sk_backlog_rcv = tcp_v4_do_rcv; + #ifdef CONFIG_TCP_MD5SIG + tp->af_specific = &tcp_sock_ipv6_mapped_specific; +@@ -246,7 +246,12 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, + + if (err) { + icsk->icsk_ext_hdr_len = exthdrlen; +- icsk->icsk_af_ops = &ipv6_specific; ++#ifdef CONFIG_MPTCP ++ if (sock_flag(sk, SOCK_MPTCP)) 
++ icsk->icsk_af_ops = &mptcp_v6_specific; ++ else ++#endif ++ icsk->icsk_af_ops = &ipv6_specific; + sk->sk_backlog_rcv = tcp_v6_do_rcv; + #ifdef CONFIG_TCP_MD5SIG + tp->af_specific = &tcp_sock_ipv6_specific; +@@ -340,7 +345,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, + return err; + } + +-static void tcp_v6_mtu_reduced(struct sock *sk) ++void tcp_v6_mtu_reduced(struct sock *sk) + { + struct dst_entry *dst; + u32 mtu; +@@ -376,7 +381,7 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + struct ipv6_pinfo *np; + struct tcp_sock *tp; + __u32 seq, snd_una; +- struct sock *sk; ++ struct sock *sk, *meta_sk; + bool fatal; + int err; + +@@ -402,8 +407,14 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + return 0; + } + +- bh_lock_sock(sk); +- if (sock_owned_by_user(sk) && type != ICMPV6_PKT_TOOBIG) ++ tp = tcp_sk(sk); ++ if (mptcp(tp)) ++ meta_sk = mptcp_meta_sk(sk); ++ else ++ meta_sk = sk; ++ ++ bh_lock_sock(meta_sk); ++ if (sock_owned_by_user(meta_sk) && type != ICMPV6_PKT_TOOBIG) + __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS); + + if (sk->sk_state == TCP_CLOSE) +@@ -414,7 +425,6 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + goto out; + } + +- tp = tcp_sk(sk); + /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */ + fastopen = rcu_dereference(tp->fastopen_rsk); + snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una; +@@ -454,11 +464,15 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + + WRITE_ONCE(tp->mtu_info, mtu); + +- if (!sock_owned_by_user(sk)) ++ if (!sock_owned_by_user(meta_sk)) { + tcp_v6_mtu_reduced(sk); +- else if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, +- &sk->sk_tsq_flags)) +- sock_hold(sk); ++ } else { ++ if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, ++ &sk->sk_tsq_flags)) ++ sock_hold(sk); ++ if (mptcp(tp)) ++ mptcp_tsq_flags(sk); ++ } + goto out; + } + +@@ -473,7 +487,7 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + if (fastopen && !fastopen->sk) + break; + +- if (!sock_owned_by_user(sk)) { ++ if (!sock_owned_by_user(meta_sk)) { + sk->sk_err = err; + sk->sk_error_report(sk); /* Wake people up to see the error (see connect in sock.c) */ + +@@ -483,14 +497,14 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + goto out; + } + +- if (!sock_owned_by_user(sk) && np->recverr) { ++ if (!sock_owned_by_user(meta_sk) && np->recverr) { + sk->sk_err = err; + sk->sk_error_report(sk); + } else + sk->sk_err_soft = err; + + out: +- bh_unlock_sock(sk); ++ bh_unlock_sock(meta_sk); + sock_put(sk); + return 0; + } +@@ -538,8 +552,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, + return err; + } + +- +-static void tcp_v6_reqsk_destructor(struct request_sock *req) ++void tcp_v6_reqsk_destructor(struct request_sock *req) + { + kfree(inet_rsk(req)->ipv6_opt); + kfree_skb(inet_rsk(req)->pktopts); +@@ -757,9 +770,10 @@ static bool tcp_v6_inbound_md5_hash(const struct sock *sk, + return false; + } + +-static void tcp_v6_init_req(struct request_sock *req, +- const struct sock *sk_listener, +- struct sk_buff *skb) ++static int tcp_v6_init_req(struct request_sock *req, ++ const struct sock *sk_listener, ++ struct sk_buff *skb, ++ bool want_cookie) + { + bool l3_slave = ipv6_l3mdev_skb(TCP_SKB_CB(skb)->header.h6.flags); + struct inet_request_sock *ireq = inet_rsk(req); +@@ -781,6 +795,8 @@ static void tcp_v6_init_req(struct request_sock *req, + 
refcount_inc(&skb->users); + ireq->pktopts = skb; + } ++ ++ return 0; + } + + static struct dst_entry *tcp_v6_route_req(const struct sock *sk, +@@ -800,7 +816,7 @@ struct request_sock_ops tcp6_request_sock_ops __read_mostly = { + .syn_ack_timeout = tcp_syn_ack_timeout, + }; + +-static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = { ++const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = { + .mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - + sizeof(struct ipv6hdr), + #ifdef CONFIG_TCP_MD5SIG +@@ -818,9 +834,9 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = { + }; + + static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 seq, +- u32 ack, u32 win, u32 tsval, u32 tsecr, ++ u32 ack, u32 data_ack, u32 win, u32 tsval, u32 tsecr, + int oif, struct tcp_md5sig_key *key, int rst, +- u8 tclass, __be32 label, u32 priority) ++ u8 tclass, __be32 label, u32 priority, int mptcp) + { + const struct tcphdr *th = tcp_hdr(skb); + struct tcphdr *t1; +@@ -839,7 +855,10 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 + if (key) + tot_len += TCPOLEN_MD5SIG_ALIGNED; + #endif +- ++#ifdef CONFIG_MPTCP ++ if (mptcp) ++ tot_len += MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK; ++#endif + buff = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) + tot_len, + GFP_ATOMIC); + if (!buff) +@@ -877,6 +896,17 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 + tcp_v6_md5_hash_hdr((__u8 *)topt, key, + &ipv6_hdr(skb)->saddr, + &ipv6_hdr(skb)->daddr, t1); ++ topt += 4; ++ } ++#endif ++#ifdef CONFIG_MPTCP ++ if (mptcp) { ++ /* Construction of 32-bit data_ack */ ++ *topt++ = htonl((TCPOPT_MPTCP << 24) | ++ ((MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK) << 16) | ++ (0x20 << 8) | ++ (0x01)); ++ *topt++ = htonl(data_ack); + } + #endif + +@@ -935,7 +965,7 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 + kfree_skb(buff); + } + +-static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) ++void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) + { + const struct tcphdr *th = tcp_hdr(skb); + struct ipv6hdr *ipv6h = ipv6_hdr(skb); +@@ -1020,8 +1050,8 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) + label = ip6_flowlabel(ipv6h); + } + +- tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, key, 1, 0, +- label, priority); ++ tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, 0, oif, key, 1, 0, ++ label, priority, 0); + + #ifdef CONFIG_TCP_MD5SIG + out: +@@ -1030,30 +1060,37 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) + } + + static void tcp_v6_send_ack(const struct sock *sk, struct sk_buff *skb, u32 seq, +- u32 ack, u32 win, u32 tsval, u32 tsecr, int oif, ++ u32 ack, u32 data_ack, u32 win, u32 tsval, u32 tsecr, int oif, + struct tcp_md5sig_key *key, u8 tclass, +- __be32 label, u32 priority) ++ __be32 label, u32 priority, int mptcp) + { +- tcp_v6_send_response(sk, skb, seq, ack, win, tsval, tsecr, oif, key, 0, +- tclass, label, priority); ++ tcp_v6_send_response(sk, skb, seq, ack, data_ack, win, tsval, tsecr, oif, ++ key, 0, tclass, label, priority, mptcp); + } + + static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb) + { + struct inet_timewait_sock *tw = inet_twsk(sk); + struct tcp_timewait_sock *tcptw = tcp_twsk(sk); ++ u32 data_ack = 0; ++ int mptcp = 0; + ++ if (tcptw->mptcp_tw) { ++ data_ack = (u32)tcptw->mptcp_tw->rcv_nxt; ++ mptcp = 1; ++ } + tcp_v6_send_ack(sk, 
skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, ++ data_ack, + tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, + tcp_time_stamp_raw() + tcptw->tw_ts_offset, + tcptw->tw_ts_recent, tw->tw_bound_dev_if, tcp_twsk_md5_key(tcptw), +- tw->tw_tclass, cpu_to_be32(tw->tw_flowlabel), tw->tw_priority); ++ tw->tw_tclass, cpu_to_be32(tw->tw_flowlabel), tw->tw_priority, mptcp); + + inet_twsk_put(tw); + } + +-static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, +- struct request_sock *req) ++void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, ++ struct request_sock *req) + { + /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV + * sk->sk_state == TCP_SYN_RECV -> for Fast Open. +@@ -1063,18 +1100,18 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, + * exception of segments, MUST be right-shifted by + * Rcv.Wind.Shift bits: + */ +- tcp_v6_send_ack(sk, skb, (sk->sk_state == TCP_LISTEN) ? ++ tcp_v6_send_ack(sk, skb, (sk->sk_state == TCP_LISTEN || is_meta_sk(sk)) ? + tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt, +- tcp_rsk(req)->rcv_nxt, ++ tcp_rsk(req)->rcv_nxt, 0, + req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale, + tcp_time_stamp_raw() + tcp_rsk(req)->ts_off, + req->ts_recent, sk->sk_bound_dev_if, + tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->saddr), +- 0, 0, sk->sk_priority); ++ 0, 0, sk->sk_priority, 0); + } + + +-static struct sock *tcp_v6_cookie_check(struct sock *sk, struct sk_buff *skb) ++struct sock *tcp_v6_cookie_check(struct sock *sk, struct sk_buff *skb) + { + #ifdef CONFIG_SYN_COOKIES + const struct tcphdr *th = tcp_hdr(skb); +@@ -1100,7 +1137,7 @@ u16 tcp_v6_get_syncookie(struct sock *sk, struct ipv6hdr *iph, + return mss; + } + +-static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) ++int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) + { + if (skb->protocol == htons(ETH_P_IP)) + return tcp_v4_conn_request(sk, skb); +@@ -1131,11 +1168,11 @@ static void tcp_v6_restore_cb(struct sk_buff *skb) + sizeof(struct inet6_skb_parm)); + } + +-static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, +- struct request_sock *req, +- struct dst_entry *dst, +- struct request_sock *req_unhash, +- bool *own_req) ++struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, ++ struct request_sock *req, ++ struct dst_entry *dst, ++ struct request_sock *req_unhash, ++ bool *own_req) + { + struct inet_request_sock *ireq; + struct ipv6_pinfo *newnp; +@@ -1170,7 +1207,15 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * + + newnp->saddr = newsk->sk_v6_rcv_saddr; + +- inet_csk(newsk)->icsk_af_ops = &ipv6_mapped; ++#ifdef CONFIG_MPTCP ++ /* We must check on the request-socket because the listener ++ * socket's flag may have been changed halfway through. ++ */ ++ if (!inet_rsk(req)->saw_mpc) ++ inet_csk(newsk)->icsk_af_ops = &mptcp_v6_mapped; ++ else ++#endif ++ inet_csk(newsk)->icsk_af_ops = &ipv6_mapped; + newsk->sk_backlog_rcv = tcp_v4_do_rcv; + #ifdef CONFIG_TCP_MD5SIG + newtp->af_specific = &tcp_sock_ipv6_mapped_specific; +@@ -1217,6 +1262,14 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * + if (!newsk) + goto out_nonewsk; + ++#ifdef CONFIG_MPTCP ++ /* If the meta_sk is v6-mapped we can end up here with the wrong af_ops. ++ * Just make sure that this subflow is v6. 
++ */ ++ if (is_meta_sk(sk)) ++ inet_csk(newsk)->icsk_af_ops = &mptcp_v6_specific; ++#endif ++ + /* + * No need to charge this sock to the relevant IPv6 refcnt debug socks + * count here, tcp_create_openreq_child now does this for us, see the +@@ -1344,7 +1397,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * + * This is because we cannot sleep with the original spinlock + * held. + */ +-static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) ++int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) + { + struct ipv6_pinfo *np = tcp_inet6_sk(sk); + struct sk_buff *opt_skb = NULL; +@@ -1361,6 +1414,9 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) + if (skb->protocol == htons(ETH_P_IP)) + return tcp_v4_do_rcv(sk, skb); + ++ if (is_meta_sk(sk)) ++ return mptcp_v6_do_rcv(sk, skb); ++ + /* + * socket locking is here for SMP purposes as backlog rcv + * is currently called with bh processing disabled. +@@ -1488,6 +1544,10 @@ static void tcp_v6_fill_cb(struct sk_buff *skb, const struct ipv6hdr *hdr, + TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + + skb->len - th->doff*4); + TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); ++#ifdef CONFIG_MPTCP ++ TCP_SKB_CB(skb)->mptcp_flags = 0; ++ TCP_SKB_CB(skb)->dss_off = 0; ++#endif + TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); + TCP_SKB_CB(skb)->tcp_tw_isn = 0; + TCP_SKB_CB(skb)->ip_dsfield = ipv6_get_dsfield(hdr); +@@ -1502,8 +1562,8 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb) + int sdif = inet6_sdif(skb); + const struct tcphdr *th; + const struct ipv6hdr *hdr; ++ struct sock *sk, *meta_sk = NULL; + bool refcounted; +- struct sock *sk; + int ret; + struct net *net = dev_net(skb->dev); + +@@ -1557,12 +1617,17 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb) + reqsk_put(req); + goto csum_error; + } +- if (unlikely(sk->sk_state != TCP_LISTEN)) { ++ if (unlikely(sk->sk_state != TCP_LISTEN && !is_meta_sk(sk))) { ++ inet_csk_reqsk_queue_drop_and_put(sk, req); ++ goto lookup; ++ } ++ if (unlikely(is_meta_sk(sk) && !mptcp_can_new_subflow(sk))) { + inet_csk_reqsk_queue_drop_and_put(sk, req); + goto lookup; + } + sock_hold(sk); + refcounted = true; ++ + nsk = NULL; + if (!tcp_filter(sk, skb)) { + th = (const struct tcphdr *)skb->data; +@@ -1621,19 +1686,28 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb) + + sk_incoming_cpu_update(sk); + +- bh_lock_sock_nested(sk); ++ if (mptcp(tcp_sk(sk))) { ++ meta_sk = mptcp_meta_sk(sk); ++ ++ bh_lock_sock_nested(meta_sk); ++ if (sock_owned_by_user(meta_sk)) ++ mptcp_prepare_for_backlog(sk, skb); ++ } else { ++ meta_sk = sk; ++ bh_lock_sock_nested(sk); ++ } + tcp_segs_in(tcp_sk(sk), skb); + ret = 0; +- if (!sock_owned_by_user(sk)) { ++ if (!sock_owned_by_user(meta_sk)) { + skb_to_free = sk->sk_rx_skb_cache; + sk->sk_rx_skb_cache = NULL; + ret = tcp_v6_do_rcv(sk, skb); + } else { +- if (tcp_add_backlog(sk, skb)) ++ if (tcp_add_backlog(meta_sk, skb)) + goto discard_and_relse; + skb_to_free = NULL; + } +- bh_unlock_sock(sk); ++ bh_unlock_sock(meta_sk); + if (skb_to_free) + __kfree_skb(skb_to_free); + put_and_return: +@@ -1647,6 +1721,19 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb) + + tcp_v6_fill_cb(skb, hdr, th); + ++#ifdef CONFIG_MPTCP ++ if (!sk && th->syn && !th->ack) { ++ int ret = mptcp_lookup_join(skb, NULL); ++ ++ if (ret < 0) { ++ tcp_v6_send_reset(NULL, skb); ++ goto discard_it; ++ } else if (ret > 0) { ++ return 0; ++ } ++ } ++#endif ++ + if (tcp_checksum_complete(skb)) { 
+ csum_error: + __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); +@@ -1699,6 +1786,18 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb) + refcounted = false; + goto process; + } ++#ifdef CONFIG_MPTCP ++ if (th->syn && !th->ack) { ++ int ret = mptcp_lookup_join(skb, inet_twsk(sk)); ++ ++ if (ret < 0) { ++ tcp_v6_send_reset(NULL, skb); ++ goto discard_it; ++ } else if (ret > 0) { ++ return 0; ++ } ++ } ++#endif + } + /* to ACK */ + /* fall through */ +@@ -1753,13 +1852,13 @@ INDIRECT_CALLABLE_SCOPE void tcp_v6_early_demux(struct sk_buff *skb) + } + } + +-static struct timewait_sock_ops tcp6_timewait_sock_ops = { ++struct timewait_sock_ops tcp6_timewait_sock_ops = { + .twsk_obj_size = sizeof(struct tcp6_timewait_sock), + .twsk_unique = tcp_twsk_unique, + .twsk_destructor = tcp_twsk_destructor, + }; + +-static const struct inet_connection_sock_af_ops ipv6_specific = { ++const struct inet_connection_sock_af_ops ipv6_specific = { + .queue_xmit = inet6_csk_xmit, + .send_check = tcp_v6_send_check, + .rebuild_header = inet6_sk_rebuild_header, +@@ -1790,7 +1889,7 @@ static const struct tcp_sock_af_ops tcp_sock_ipv6_specific = { + /* + * TCP over IPv4 via INET6 API + */ +-static const struct inet_connection_sock_af_ops ipv6_mapped = { ++const struct inet_connection_sock_af_ops ipv6_mapped = { + .queue_xmit = ip_queue_xmit, + .send_check = tcp_v4_send_check, + .rebuild_header = inet_sk_rebuild_header, +@@ -1826,7 +1925,12 @@ static int tcp_v6_init_sock(struct sock *sk) + + tcp_init_sock(sk); + +- icsk->icsk_af_ops = &ipv6_specific; ++#ifdef CONFIG_MPTCP ++ if (sock_flag(sk, SOCK_MPTCP)) ++ icsk->icsk_af_ops = &mptcp_v6_specific; ++ else ++#endif ++ icsk->icsk_af_ops = &ipv6_specific; + + #ifdef CONFIG_TCP_MD5SIG + tcp_sk(sk)->af_specific = &tcp_sock_ipv6_specific; +@@ -1835,7 +1939,7 @@ static int tcp_v6_init_sock(struct sock *sk) + return 0; + } + +-static void tcp_v6_destroy_sock(struct sock *sk) ++void tcp_v6_destroy_sock(struct sock *sk) + { + tcp_v4_destroy_sock(sk); + inet6_destroy_sock(sk); +@@ -2058,6 +2162,11 @@ struct proto tcpv6_prot = { + .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), + .max_header = MAX_TCP_HEADER, + .obj_size = sizeof(struct tcp6_sock), ++#ifdef CONFIG_MPTCP ++ .useroffset = offsetof(struct tcp_sock, mptcp_sched_name), ++ .usersize = sizeof_field(struct tcp_sock, mptcp_sched_name) + ++ sizeof_field(struct tcp_sock, mptcp_pm_name), ++#endif + .slab_flags = SLAB_TYPESAFE_BY_RCU, + .twsk_prot = &tcp6_timewait_sock_ops, + .rsk_prot = &tcp6_request_sock_ops, +@@ -2068,6 +2177,9 @@ struct proto tcpv6_prot = { + .compat_getsockopt = compat_tcp_getsockopt, + #endif + .diag_destroy = tcp_abort, ++#ifdef CONFIG_MPTCP ++ .clear_sk = mptcp_clear_sk, ++#endif + }; + + /* thinking of making this const? Don't. +diff --git a/net/mptcp/Kconfig b/net/mptcp/Kconfig +new file mode 100644 +index 000000000000..6e05dab4c632 +--- /dev/null ++++ b/net/mptcp/Kconfig +@@ -0,0 +1,154 @@ ++# ++# MPTCP configuration ++# ++config MPTCP ++ bool "MPTCP protocol" ++ depends on (IPV6=y || IPV6=n) ++ select CRYPTO_LIB_SHA256 ++ select CRYPTO ++ ---help--- ++ This replaces the normal TCP stack with a Multipath TCP stack, ++ able to use several paths at once. ++ ++menuconfig MPTCP_PM_ADVANCED ++ bool "MPTCP: advanced path-manager control" ++ depends on MPTCP=y ++ ---help--- ++ Support for selection of different path-managers. You should choose 'Y' here, ++ because otherwise you will not actively create new MPTCP-subflows. 
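Beyond the build-time defaults configured here, the sysctl table added by this patch in mptcp_ctrl.c exposes an "mptcp_path_manager" entry, so the active path-manager can also be switched at runtime. A minimal userspace sketch, assuming the table ends up under /proc/sys/net/mptcp/ (the same "net/mptcp" directory the binder module below registers its own sysctl in) and using the manager names defined by DEFAULT_MPTCP_PM:

#include <stdio.h>

/* Illustrative helper only: writes a path-manager name ("fullmesh",
 * "ndiffports", "binder" or "default") into the sysctl added by this
 * patch. The /proc path is an assumption based on the "net/mptcp"
 * registration visible in the binder module further down.
 */
int main(int argc, char **argv)
{
	const char *pm = argc > 1 ? argv[1] : "fullmesh";
	FILE *f = fopen("/proc/sys/net/mptcp/mptcp_path_manager", "w");

	if (!f) {
		perror("mptcp_path_manager");
		return 1;
	}
	fprintf(f, "%s\n", pm);
	return fclose(f) ? 1 : 0;
}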
++
++if MPTCP_PM_ADVANCED
++
++config MPTCP_FULLMESH
++ tristate "MPTCP Full-Mesh Path-Manager"
++ depends on MPTCP=y
++ ---help---
++ This path-management module will create a full-mesh among all IP-addresses.
++
++config MPTCP_NDIFFPORTS
++ tristate "MPTCP ndiff-ports"
++ depends on MPTCP=y
++ ---help---
++ This path-management module will create multiple subflows between the same
++ pair of IP-addresses, modifying the source-port. You can set the number
++ of subflows via the mptcp_ndiffports-sysctl.
++
++config MPTCP_BINDER
++ tristate "MPTCP Binder"
++ depends on (MPTCP=y)
++ ---help---
++ This path-management module works like ndiffports, and adds the sysctl
++ option to set the gateway (and/or path to) for each additional subflow
++ via Loose Source Routing (IPv4 only).
++
++config MPTCP_NETLINK
++ tristate "MPTCP Netlink Path-Manager"
++ depends on MPTCP=y
++ ---help---
++ This path-management module is controlled over a Netlink interface. A userspace
++ module can therefore control the establishment of new subflows and the policy
++ to apply over those new subflows for every connection.
++
++choice
++ prompt "Default MPTCP Path-Manager"
++ default DEFAULT_DUMMY
++ help
++ Select the Path-Manager of your choice
++
++ config DEFAULT_FULLMESH
++ bool "Full mesh" if MPTCP_FULLMESH=y
++
++ config DEFAULT_NDIFFPORTS
++ bool "ndiff-ports" if MPTCP_NDIFFPORTS=y
++
++ config DEFAULT_BINDER
++ bool "binder" if MPTCP_BINDER=y
++
++ config DEFAULT_NETLINK
++ bool "Netlink" if MPTCP_NETLINK=y
++
++ config DEFAULT_DUMMY
++ bool "Default"
++
++endchoice
++
++endif
++
++config DEFAULT_MPTCP_PM
++ string
++ default "default" if DEFAULT_DUMMY
++ default "fullmesh" if DEFAULT_FULLMESH
++ default "ndiffports" if DEFAULT_NDIFFPORTS
++ default "binder" if DEFAULT_BINDER
++ default "default"
++
++menuconfig MPTCP_SCHED_ADVANCED
++ bool "MPTCP: advanced scheduler control"
++ depends on MPTCP=y
++ ---help---
++ Support for selection of different schedulers. You should choose 'Y' here,
++ if you want to choose a different scheduler than the default one.
++
++if MPTCP_SCHED_ADVANCED
++
++config MPTCP_BLEST
++ tristate "MPTCP BLEST"
++ depends on MPTCP=y
++ ---help---
++ This is an experimental BLocking ESTimation-based (BLEST) scheduler.
++
++config MPTCP_ROUNDROBIN
++ tristate "MPTCP Round-Robin"
++ depends on (MPTCP=y)
++ ---help---
++ This is a very simple round-robin scheduler. Probably has bad performance
++ but might be interesting for researchers.
++
++config MPTCP_REDUNDANT
++ tristate "MPTCP Redundant"
++ depends on (MPTCP=y)
++ ---help---
++ This scheduler sends all packets redundantly over all subflows to decrease
++ latency and jitter at the cost of lower throughput.
++
++config MPTCP_ECF
++ tristate "MPTCP ECF"
++ depends on (MPTCP=y)
++ ---help---
++ This is an experimental Earliest Completion First (ECF) scheduler.
++
++choice
++ prompt "Default MPTCP Scheduler"
++ default DEFAULT_SCHEDULER
++ help
++ Select the Scheduler of your choice
++
++ config DEFAULT_SCHEDULER
++ bool "Default"
++ ---help---
++ This is the default scheduler, sending first on the subflow
++ with the lowest RTT.
++
++ config DEFAULT_ROUNDROBIN
++ bool "Round-Robin" if MPTCP_ROUNDROBIN=y
++ ---help---
++ This is the round-robin scheduler, sending in a round-robin
++ fashion.
++
++ config DEFAULT_REDUNDANT
++ bool "Redundant" if MPTCP_REDUNDANT=y
++ ---help---
++ This is the redundant scheduler, sending packets redundantly over
++ all the subflows.
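The "Default" entry describes the stock policy: among the subflows that are currently allowed to send, transmit on the one with the lowest smoothed RTT. A self-contained sketch of that selection rule, using a plain array instead of the kernel's socket structures (illustrative only, not the kernel scheduler):

#include <stdint.h>
#include <stdio.h>

struct subflow {
	uint32_t srtt_us;  /* smoothed RTT estimate in microseconds */
	int can_send;      /* established and currently allowed to transmit */
};

/* Return the index of the sendable subflow with the lowest RTT, -1 if none. */
static int pick_lowest_rtt(const struct subflow *sf, int n)
{
	int best = -1;
	int i;

	for (i = 0; i < n; i++) {
		if (!sf[i].can_send)
			continue;
		if (best < 0 || sf[i].srtt_us < sf[best].srtt_us)
			best = i;
	}
	return best;
}

int main(void)
{
	struct subflow sf[] = { { 40000, 1 }, { 12000, 1 }, { 8000, 0 } };

	printf("chosen subflow: %d\n", pick_lowest_rtt(sf, 3)); /* prints 1 */
	return 0;
}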
++
++endchoice
++endif
++
++config DEFAULT_MPTCP_SCHED
++ string
++ depends on (MPTCP=y)
++ default "default" if DEFAULT_SCHEDULER
++ default "roundrobin" if DEFAULT_ROUNDROBIN
++ default "redundant" if DEFAULT_REDUNDANT
++ default "default"
++
+diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile
+new file mode 100644
+index 000000000000..369248a2f68e
+--- /dev/null
++++ b/net/mptcp/Makefile
+@@ -0,0 +1,25 @@
++#
++## Makefile for MultiPath TCP support code.
++#
++#
++
++obj-$(CONFIG_MPTCP) += mptcp.o
++
++mptcp-y := mptcp_ctrl.o mptcp_ipv4.o mptcp_pm.o \
++ mptcp_output.o mptcp_input.o mptcp_sched.o
++
++obj-$(CONFIG_TCP_CONG_LIA) += mptcp_coupled.o
++obj-$(CONFIG_TCP_CONG_OLIA) += mptcp_olia.o
++obj-$(CONFIG_TCP_CONG_WVEGAS) += mptcp_wvegas.o
++obj-$(CONFIG_TCP_CONG_BALIA) += mptcp_balia.o
++obj-$(CONFIG_TCP_CONG_MCTCPDESYNC) += mctcp_desync.o
++obj-$(CONFIG_MPTCP_FULLMESH) += mptcp_fullmesh.o
++obj-$(CONFIG_MPTCP_NDIFFPORTS) += mptcp_ndiffports.o
++obj-$(CONFIG_MPTCP_BINDER) += mptcp_binder.o
++obj-$(CONFIG_MPTCP_NETLINK) += mptcp_netlink.o
++obj-$(CONFIG_MPTCP_ROUNDROBIN) += mptcp_rr.o
++obj-$(CONFIG_MPTCP_REDUNDANT) += mptcp_redundant.o
++obj-$(CONFIG_MPTCP_BLEST) += mptcp_blest.o
++obj-$(CONFIG_MPTCP_ECF) += mptcp_ecf.o
++
++mptcp-$(subst m,y,$(CONFIG_IPV6)) += mptcp_ipv6.o
+diff --git a/net/mptcp/mctcp_desync.c b/net/mptcp/mctcp_desync.c
+new file mode 100644
+index 000000000000..f6bf9251d59b
+--- /dev/null
++++ b/net/mptcp/mctcp_desync.c
+@@ -0,0 +1,193 @@
++/*
++ * Desynchronized Multi-Channel TCP Congestion Control Algorithm
++ *
++ * Implementation based on publications of "DMCTCP:Desynchronized Multi-Channel
++ * TCP for high speed access networks with tiny buffers" in 23rd international
++ * conference of Computer Communication and Networks (ICCCN), 2014, and
++ * "Exploring parallelism and desynchronization of TCP over high speed networks
++ * with tiny buffers" in Journal of Computer Communications Elsevier, 2015.
++ *
++ * http://ieeexplore.ieee.org/abstract/document/6911722/
++ * https://doi.org/10.1016/j.comcom.2015.07.010
++ *
++ * This prototype is for research purposes and is currently experimental code
++ * that only supports a single path. Future support of multi-channel over
++ * multi-path requires channel grouping.
++ *
++ * Initial Design and Implementation:
++ * Cheng Cui
++ *
++ * This program is free software; you can redistribute it and/or modify it
++ * under the terms of the GNU General Public License as published by the Free
++ * Software Foundation; either version 2 of the License, or (at your option)
++ * any later version.
++ */ ++#include ++#include ++#include ++ ++enum { ++ MASTER_CHANNEL = 1, ++ INI_MIN_CWND = 2, ++}; ++ ++/* private congestion control structure: ++ * off_tstamp: the last backoff timestamp for loss synchronization event ++ * off_subfid: the subflow which was backoff on off_tstamp ++ */ ++struct mctcp_desync { ++ u64 off_tstamp; ++ u8 off_subfid; ++}; ++ ++static inline int mctcp_cc_sk_can_send(const struct sock *sk) ++{ ++ return mptcp_sk_can_send(sk) && tcp_sk(sk)->srtt_us; ++} ++ ++static void mctcp_desync_init(struct sock *sk) ++{ ++ if (mptcp(tcp_sk(sk))) { ++ struct mctcp_desync *ca = inet_csk_ca(mptcp_meta_sk(sk)); ++ ca->off_tstamp = 0; ++ ca->off_subfid = 0; ++ } ++ /* If we do not mptcp, behave like reno: return */ ++} ++ ++static void mctcp_desync_cong_avoid(struct sock *sk, u32 ack, u32 acked) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ ++ if (!mptcp(tp)) { ++ tcp_reno_cong_avoid(sk, ack, acked); ++ return; ++ } else if (!tcp_is_cwnd_limited(sk)) { ++ return; ++ } else { ++ const struct mctcp_desync *ca = inet_csk_ca(mptcp_meta_sk(sk)); ++ const u8 subfid = tp->mptcp->path_index; ++ ++ /* current aggregated cwnd */ ++ u32 agg_cwnd = 0; ++ u32 min_cwnd = 0xffffffff; ++ u8 min_cwnd_subfid = 0; ++ ++ /* In "safe" area, increase */ ++ if (tcp_in_slow_start(tp)) { ++ if (ca->off_subfid) { ++ /* passed initial phase, allow slow start */ ++ tcp_slow_start(tp, acked); ++ } else if (MASTER_CHANNEL == tp->mptcp->path_index) { ++ /* master channel is normal slow start in ++ * initial phase */ ++ tcp_slow_start(tp, acked); ++ } else { ++ /* secondary channels increase slowly until ++ * the initial phase passed ++ */ ++ tp->snd_ssthresh = tp->snd_cwnd = INI_MIN_CWND; ++ } ++ return; ++ } else { ++ /* In dangerous area, increase slowly and linearly. */ ++ const struct mptcp_tcp_sock *mptcp; ++ ++ /* get total cwnd and the subflow that has min cwnd */ ++ mptcp_for_each_sub(tp->mpcb, mptcp) { ++ const struct sock *sub_sk = mptcp_to_sock(mptcp); ++ ++ if (mctcp_cc_sk_can_send(sub_sk)) { ++ const struct tcp_sock *sub_tp = ++ tcp_sk(sub_sk); ++ agg_cwnd += sub_tp->snd_cwnd; ++ if(min_cwnd > sub_tp->snd_cwnd) { ++ min_cwnd = sub_tp->snd_cwnd; ++ min_cwnd_subfid = ++ sub_tp->mptcp->path_index; ++ } ++ } ++ } ++ /* the smallest subflow grows faster than others */ ++ if (subfid == min_cwnd_subfid) { ++ tcp_cong_avoid_ai(tp, min_cwnd, acked); ++ } else { ++ tcp_cong_avoid_ai(tp, agg_cwnd - min_cwnd, ++ acked); ++ } ++ } ++ } ++} ++ ++static u32 mctcp_desync_ssthresh(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ ++ if (!mptcp(tp)) { ++ return max(tp->snd_cwnd >> 1U, 2U); ++ } else { ++ struct mctcp_desync *ca = inet_csk_ca(mptcp_meta_sk(sk)); ++ const u8 subfid = tp->mptcp->path_index; ++ const struct mptcp_tcp_sock *mptcp; ++ u32 max_cwnd = 0; ++ u8 max_cwnd_subfid = 0; ++ ++ /* Find the subflow that has the max cwnd. */ ++ mptcp_for_each_sub(tp->mpcb, mptcp) { ++ const struct sock *sub_sk = mptcp_to_sock(mptcp); ++ ++ if (mctcp_cc_sk_can_send(sub_sk)) { ++ const struct tcp_sock *sub_tp = tcp_sk(sub_sk); ++ if (max_cwnd < sub_tp->snd_cwnd) { ++ max_cwnd = sub_tp->snd_cwnd; ++ max_cwnd_subfid = ++ sub_tp->mptcp->path_index; ++ } ++ } ++ } ++ /* Use high resolution clock. 
*/ ++ if (subfid == max_cwnd_subfid) { ++ u64 now = tcp_clock_us(); ++ u32 delta = tcp_stamp_us_delta(now, ca->off_tstamp); ++ ++ if (delta < (tp->srtt_us >> 3)) { ++ /* desynchronize */ ++ return tp->snd_cwnd; ++ } else { ++ ca->off_tstamp = now; ++ ca->off_subfid = subfid; ++ return max(max_cwnd >> 1U, 2U); ++ } ++ } else { ++ return tp->snd_cwnd; ++ } ++ } ++} ++ ++static struct tcp_congestion_ops mctcp_desync = { ++ .init = mctcp_desync_init, ++ .ssthresh = mctcp_desync_ssthresh, ++ .undo_cwnd = tcp_reno_undo_cwnd, ++ .cong_avoid = mctcp_desync_cong_avoid, ++ .owner = THIS_MODULE, ++ .name = "mctcpdesync", ++}; ++ ++static int __init mctcp_desync_register(void) ++{ ++ BUILD_BUG_ON(sizeof(struct mctcp_desync) > ICSK_CA_PRIV_SIZE); ++ return tcp_register_congestion_control(&mctcp_desync); ++} ++ ++static void __exit mctcp_desync_unregister(void) ++{ ++ tcp_unregister_congestion_control(&mctcp_desync); ++} ++ ++module_init(mctcp_desync_register); ++module_exit(mctcp_desync_unregister); ++ ++MODULE_AUTHOR("Cheng Cui"); ++MODULE_LICENSE("GPL"); ++MODULE_DESCRIPTION("MCTCP: DESYNCHRONIZED MULTICHANNEL TCP CONGESTION CONTROL"); ++MODULE_VERSION("1.0"); +diff --git a/net/mptcp/mptcp_balia.c b/net/mptcp/mptcp_balia.c +new file mode 100644 +index 000000000000..179b53dea020 +--- /dev/null ++++ b/net/mptcp/mptcp_balia.c +@@ -0,0 +1,261 @@ ++/* ++ * MPTCP implementation - Balia Congestion Control ++ * (Balanced Linked Adaptation Algorithm) ++ * ++ * Analysis, Design and Implementation: ++ * Qiuyu Peng ++ * Anwar Walid ++ * Jaehyun Hwang ++ * Steven H. Low ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License ++ * as published by the Free Software Foundation; either version ++ * 2 of the License, or (at your option) any later version. 
++ */ ++ ++#include ++#include ++ ++#include ++ ++/* The variable 'rate' (i.e., x_r) will be scaled ++ * e.g., from B/s to KB/s, MB/s, or GB/s ++ * if max_rate > 2^rate_scale_limit ++ */ ++ ++static int rate_scale_limit = 25; ++static int alpha_scale = 10; ++static int scale_num = 5; ++ ++struct mptcp_balia { ++ u64 ai; ++ u64 md; ++ bool forced_update; ++}; ++ ++static inline int mptcp_balia_sk_can_send(const struct sock *sk) ++{ ++ return mptcp_sk_can_send(sk) && tcp_sk(sk)->srtt_us; ++} ++ ++static inline u64 mptcp_get_ai(const struct sock *meta_sk) ++{ ++ return ((struct mptcp_balia *)inet_csk_ca(meta_sk))->ai; ++} ++ ++static inline void mptcp_set_ai(const struct sock *meta_sk, u64 ai) ++{ ++ ((struct mptcp_balia *)inet_csk_ca(meta_sk))->ai = ai; ++} ++ ++static inline u64 mptcp_get_md(const struct sock *meta_sk) ++{ ++ return ((struct mptcp_balia *)inet_csk_ca(meta_sk))->md; ++} ++ ++static inline void mptcp_set_md(const struct sock *meta_sk, u64 md) ++{ ++ ((struct mptcp_balia *)inet_csk_ca(meta_sk))->md = md; ++} ++ ++static inline u64 mptcp_balia_scale(u64 val, int scale) ++{ ++ return (u64) val << scale; ++} ++ ++static inline bool mptcp_get_forced(const struct sock *meta_sk) ++{ ++ return ((struct mptcp_balia *)inet_csk_ca(meta_sk))->forced_update; ++} ++ ++static inline void mptcp_set_forced(const struct sock *meta_sk, bool force) ++{ ++ ((struct mptcp_balia *)inet_csk_ca(meta_sk))->forced_update = force; ++} ++ ++static void mptcp_balia_recalc_ai(const struct sock *sk) ++{ ++ const struct tcp_sock *tp = tcp_sk(sk); ++ const struct mptcp_cb *mpcb = tp->mpcb; ++ struct mptcp_tcp_sock *mptcp; ++ u64 max_rate = 0, rate = 0, sum_rate = 0; ++ u64 alpha, ai = tp->snd_cwnd, md = (tp->snd_cwnd >> 1); ++ int num_scale_down = 0; ++ ++ if (!mpcb) ++ return; ++ ++ /* Find max_rate first */ ++ mptcp_for_each_sub(mpcb, mptcp) { ++ const struct sock *sub_sk = mptcp_to_sock(mptcp); ++ struct tcp_sock *sub_tp = tcp_sk(sub_sk); ++ u64 tmp; ++ ++ if (!mptcp_balia_sk_can_send(sub_sk)) ++ continue; ++ ++ tmp = div_u64((u64)tp->mss_cache * sub_tp->snd_cwnd ++ * (USEC_PER_SEC << 3), sub_tp->srtt_us); ++ sum_rate += tmp; ++ ++ if (tp == sub_tp) ++ rate = tmp; ++ ++ if (tmp >= max_rate) ++ max_rate = tmp; ++ } ++ ++ /* At least, the current subflow should be able to send */ ++ if (unlikely(!rate)) ++ goto exit; ++ ++ alpha = div64_u64(max_rate, rate); ++ ++ /* Scale down max_rate if it is too high (e.g., >2^25) */ ++ while (max_rate > mptcp_balia_scale(1, rate_scale_limit)) { ++ max_rate >>= scale_num; ++ num_scale_down++; ++ } ++ ++ if (num_scale_down) { ++ sum_rate = 0; ++ mptcp_for_each_sub(mpcb, mptcp) { ++ const struct sock *sub_sk = mptcp_to_sock(mptcp); ++ struct tcp_sock *sub_tp = tcp_sk(sub_sk); ++ u64 tmp; ++ ++ if (!mptcp_balia_sk_can_send(sub_sk)) ++ continue; ++ ++ tmp = div_u64((u64)tp->mss_cache * sub_tp->snd_cwnd ++ * (USEC_PER_SEC << 3), sub_tp->srtt_us); ++ tmp >>= (scale_num * num_scale_down); ++ ++ sum_rate += tmp; ++ } ++ rate >>= (scale_num * num_scale_down); ++ } ++ ++ /* (sum_rate)^2 * 10 * w_r ++ * ai = ------------------------------------ ++ * (x_r + max_rate) * (4x_r + max_rate) ++ */ ++ sum_rate *= sum_rate; ++ ++ ai = div64_u64(sum_rate * 10, rate + max_rate); ++ ai = div64_u64(ai * tp->snd_cwnd, (rate << 2) + max_rate); ++ ++ if (unlikely(!ai)) ++ ai = tp->snd_cwnd; ++ ++ md = ((tp->snd_cwnd >> 1) * min(mptcp_balia_scale(alpha, alpha_scale), ++ mptcp_balia_scale(3, alpha_scale) >> 1)) ++ >> alpha_scale; ++ ++exit: ++ mptcp_set_ai(sk, ai); ++ mptcp_set_md(sk, md); ++} ++ 
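Restated as equations, mptcp_balia_recalc_ai() above derives the Balia increase and decrease terms from the per-subflow rate estimate x_i = mss * cwnd_i * (USEC_PER_SEC * 8) / srtt_i, up to the fixed-point downscaling applied once rates exceed 2^rate_scale_limit. With x_r and w_r the current subflow's rate and cwnd, and x_max the largest per-subflow rate:

\[
a_r = \frac{10\, w_r \left(\sum_i x_i\right)^2}{(x_r + x_{\max})\,(4 x_r + x_{\max})},
\qquad
m_r = \frac{w_r}{2}\,\min\!\left(\frac{x_{\max}}{x_r},\ \frac{3}{2}\right)
\]

The resulting a_r and m_r are stored via mptcp_set_ai()/mptcp_set_md() and consumed by the cong_avoid and ssthresh callbacks that follow; note that x_max/x_r is computed with integer division in the code.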
++static void mptcp_balia_init(struct sock *sk) ++{ ++ if (mptcp(tcp_sk(sk))) { ++ mptcp_set_forced(sk, 0); ++ mptcp_set_ai(sk, 0); ++ mptcp_set_md(sk, 0); ++ } ++} ++ ++static void mptcp_balia_cwnd_event(struct sock *sk, enum tcp_ca_event event) ++{ ++ if (event == CA_EVENT_COMPLETE_CWR || event == CA_EVENT_LOSS) ++ mptcp_balia_recalc_ai(sk); ++} ++ ++static void mptcp_balia_set_state(struct sock *sk, u8 ca_state) ++{ ++ if (!mptcp(tcp_sk(sk))) ++ return; ++ ++ mptcp_set_forced(sk, 1); ++} ++ ++static void mptcp_balia_cong_avoid(struct sock *sk, u32 ack, u32 acked) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ int snd_cwnd; ++ ++ if (!mptcp(tp)) { ++ tcp_reno_cong_avoid(sk, ack, acked); ++ return; ++ } ++ ++ if (!tcp_is_cwnd_limited(sk)) ++ return; ++ ++ if (tcp_in_slow_start(tp)) { ++ /* In "safe" area, increase. */ ++ tcp_slow_start(tp, acked); ++ mptcp_balia_recalc_ai(sk); ++ return; ++ } ++ ++ if (mptcp_get_forced(mptcp_meta_sk(sk))) { ++ mptcp_balia_recalc_ai(sk); ++ mptcp_set_forced(sk, 0); ++ } ++ ++ snd_cwnd = (int)mptcp_get_ai(sk); ++ ++ if (tp->snd_cwnd_cnt >= snd_cwnd) { ++ if (tp->snd_cwnd < tp->snd_cwnd_clamp) { ++ tp->snd_cwnd++; ++ mptcp_balia_recalc_ai(sk); ++ } ++ ++ tp->snd_cwnd_cnt = 0; ++ } else { ++ tp->snd_cwnd_cnt++; ++ } ++} ++ ++static u32 mptcp_balia_ssthresh(struct sock *sk) ++{ ++ const struct tcp_sock *tp = tcp_sk(sk); ++ ++ if (unlikely(!mptcp(tp))) ++ return tcp_reno_ssthresh(sk); ++ else ++ return max((u32)(tp->snd_cwnd - mptcp_get_md(sk)), 1U); ++} ++ ++static struct tcp_congestion_ops mptcp_balia = { ++ .init = mptcp_balia_init, ++ .ssthresh = mptcp_balia_ssthresh, ++ .cong_avoid = mptcp_balia_cong_avoid, ++ .undo_cwnd = tcp_reno_undo_cwnd, ++ .cwnd_event = mptcp_balia_cwnd_event, ++ .set_state = mptcp_balia_set_state, ++ .owner = THIS_MODULE, ++ .name = "balia", ++}; ++ ++static int __init mptcp_balia_register(void) ++{ ++ BUILD_BUG_ON(sizeof(struct mptcp_balia) > ICSK_CA_PRIV_SIZE); ++ return tcp_register_congestion_control(&mptcp_balia); ++} ++ ++static void __exit mptcp_balia_unregister(void) ++{ ++ tcp_unregister_congestion_control(&mptcp_balia); ++} ++ ++module_init(mptcp_balia_register); ++module_exit(mptcp_balia_unregister); ++ ++MODULE_AUTHOR("Jaehyun Hwang, Anwar Walid, Qiuyu Peng, Steven H. 
Low"); ++MODULE_LICENSE("GPL"); ++MODULE_DESCRIPTION("MPTCP BALIA CONGESTION CONTROL ALGORITHM"); ++MODULE_VERSION("0.1"); +diff --git a/net/mptcp/mptcp_binder.c b/net/mptcp/mptcp_binder.c +new file mode 100644 +index 000000000000..7f34a8d00274 +--- /dev/null ++++ b/net/mptcp/mptcp_binder.c +@@ -0,0 +1,494 @@ ++#include ++ ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#define MPTCP_GW_MAX_LISTS 10 ++#define MPTCP_GW_LIST_MAX_LEN 6 ++#define MPTCP_GW_SYSCTL_MAX_LEN (15 * MPTCP_GW_LIST_MAX_LEN * \ ++ MPTCP_GW_MAX_LISTS) ++ ++struct mptcp_gw_list { ++ struct in_addr list[MPTCP_GW_MAX_LISTS][MPTCP_GW_LIST_MAX_LEN]; ++ u8 len[MPTCP_GW_MAX_LISTS]; ++}; ++ ++struct binder_priv { ++ /* Worker struct for subflow establishment */ ++ struct work_struct subflow_work; ++ ++ struct mptcp_cb *mpcb; ++ ++ /* Prevent multiple sub-sockets concurrently iterating over sockets */ ++ spinlock_t *flow_lock; ++}; ++ ++static struct mptcp_gw_list *mptcp_gws; ++static rwlock_t mptcp_gws_lock; ++ ++static int mptcp_binder_ndiffports __read_mostly = 1; ++ ++static char sysctl_mptcp_binder_gateways[MPTCP_GW_SYSCTL_MAX_LEN] __read_mostly; ++ ++static int mptcp_get_avail_list_ipv4(struct sock *sk) ++{ ++ int i, j, list_taken, opt_ret, opt_len; ++ unsigned char *opt_ptr, *opt_end_ptr, opt[MAX_IPOPTLEN]; ++ ++ for (i = 0; i < MPTCP_GW_MAX_LISTS; ++i) { ++ struct mptcp_tcp_sock *mptcp; ++ ++ if (mptcp_gws->len[i] == 0) ++ goto error; ++ ++ mptcp_debug("mptcp_get_avail_list_ipv4: List %i\n", i); ++ list_taken = 0; ++ ++ /* Loop through all sub-sockets in this connection */ ++ mptcp_for_each_sub(tcp_sk(sk)->mpcb, mptcp) { ++ sk = mptcp_to_sock(mptcp); ++ ++ mptcp_debug("mptcp_get_avail_list_ipv4: Next sock\n"); ++ ++ /* Reset length and options buffer, then retrieve ++ * from socket ++ */ ++ opt_len = MAX_IPOPTLEN; ++ memset(opt, 0, MAX_IPOPTLEN); ++ opt_ret = ip_getsockopt(sk, IPPROTO_IP, ++ IP_OPTIONS, (char __user *)opt, (int __user *)&opt_len); ++ if (opt_ret < 0) { ++ mptcp_debug("%s: MPTCP subsocket getsockopt() IP_OPTIONS failed, error %d\n", ++ __func__, opt_ret); ++ goto error; ++ } ++ ++ /* If socket has no options, it has no stake in this list */ ++ if (opt_len <= 0) ++ continue; ++ ++ /* Iterate options buffer */ ++ for (opt_ptr = &opt[0]; opt_ptr < &opt[opt_len]; opt_ptr++) { ++ if (*opt_ptr == IPOPT_LSRR) { ++ mptcp_debug("mptcp_get_avail_list_ipv4: LSRR options found\n"); ++ goto sock_lsrr; ++ } ++ } ++ continue; ++ ++sock_lsrr: ++ /* Pointer to the 2nd to last address */ ++ opt_end_ptr = opt_ptr+(*(opt_ptr+1))-4; ++ ++ /* Addresses start 3 bytes after type offset */ ++ opt_ptr += 3; ++ j = 0; ++ ++ /* Different length lists cannot be the same */ ++ if ((opt_end_ptr-opt_ptr)/4 != mptcp_gws->len[i]) ++ continue; ++ ++ /* Iterate if we are still inside options list ++ * and sysctl list ++ */ ++ while (opt_ptr < opt_end_ptr && j < mptcp_gws->len[i]) { ++ /* If there is a different address, this list must ++ * not be set on this socket ++ */ ++ if (memcmp(&mptcp_gws->list[i][j], opt_ptr, 4)) ++ break; ++ ++ /* Jump 4 bytes to next address */ ++ opt_ptr += 4; ++ j++; ++ } ++ ++ /* Reached the end without a differing address, lists ++ * are therefore identical. 
++ */ ++ if (j == mptcp_gws->len[i]) { ++ mptcp_debug("mptcp_get_avail_list_ipv4: List already used\n"); ++ list_taken = 1; ++ break; ++ } ++ } ++ ++ /* Free list found if not taken by a socket */ ++ if (!list_taken) { ++ mptcp_debug("mptcp_get_avail_list_ipv4: List free\n"); ++ break; ++ } ++ } ++ ++ if (i >= MPTCP_GW_MAX_LISTS) ++ goto error; ++ ++ return i; ++error: ++ return -1; ++} ++ ++/* The list of addresses is parsed each time a new connection is opened, ++ * to make sure it's up to date. In case of error, all the lists are ++ * marked as unavailable and the subflow's fingerprint is set to 0. ++ */ ++static void mptcp_v4_add_lsrr(struct sock *sk, struct in_addr addr) ++{ ++ int i, j, ret; ++ unsigned char opt[MAX_IPOPTLEN] = {0}; ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct binder_priv *fmp = (struct binder_priv *)&tp->mpcb->mptcp_pm[0]; ++ ++ /* Read lock: multiple sockets can read LSRR addresses at the same ++ * time, but writes are done in mutual exclusion. ++ * Spin lock: must search for free list for one socket at a time, or ++ * multiple sockets could take the same list. ++ */ ++ read_lock(&mptcp_gws_lock); ++ spin_lock(fmp->flow_lock); ++ ++ i = mptcp_get_avail_list_ipv4(sk); ++ ++ /* Execution enters here only if a free path is found. ++ */ ++ if (i >= 0) { ++ opt[0] = IPOPT_NOP; ++ opt[1] = IPOPT_LSRR; ++ opt[2] = sizeof(mptcp_gws->list[i][0].s_addr) * ++ (mptcp_gws->len[i] + 1) + 3; ++ opt[3] = IPOPT_MINOFF; ++ for (j = 0; j < mptcp_gws->len[i]; ++j) ++ memcpy(opt + 4 + ++ (j * sizeof(mptcp_gws->list[i][0].s_addr)), ++ &mptcp_gws->list[i][j].s_addr, ++ sizeof(mptcp_gws->list[i][0].s_addr)); ++ /* Final destination must be part of IP_OPTIONS parameter. */ ++ memcpy(opt + 4 + (j * sizeof(addr.s_addr)), &addr.s_addr, ++ sizeof(addr.s_addr)); ++ ++ /* setsockopt must be inside the lock, otherwise another ++ * subflow could fail to see that we have taken a list. ++ */ ++ ret = ip_setsockopt(sk, IPPROTO_IP, IP_OPTIONS, (char __user *)opt, ++ 4 + sizeof(mptcp_gws->list[i][0].s_addr) * (mptcp_gws->len[i] + 1)); ++ ++ if (ret < 0) { ++ mptcp_debug("%s: MPTCP subsock setsockopt() IP_OPTIONS failed, error %d\n", ++ __func__, ret); ++ } ++ } ++ ++ spin_unlock(fmp->flow_lock); ++ read_unlock(&mptcp_gws_lock); ++ ++ return; ++} ++ ++/* Parses gateways string for a list of paths to different ++ * gateways, and stores them for use with the Loose Source Routing (LSRR) ++ * socket option. Each list must have "," separated addresses, and the lists ++ * themselves must be separated by "-". Returns -1 in case one or more of the ++ * addresses is not a valid ipv4/6 address. ++ */ ++static int mptcp_parse_gateway_ipv4(char *gateways) ++{ ++ int i, j, k, ret; ++ char *tmp_string = NULL; ++ struct in_addr tmp_addr; ++ ++ tmp_string = kzalloc(16, GFP_KERNEL); ++ if (tmp_string == NULL) ++ return -ENOMEM; ++ ++ write_lock(&mptcp_gws_lock); ++ ++ memset(mptcp_gws, 0, sizeof(struct mptcp_gw_list)); ++ ++ /* A TMP string is used since inet_pton needs a null terminated string ++ * but we do not want to modify the sysctl for obvious reasons. ++ * i will iterate over the SYSCTL string, j will iterate over the ++ * temporary string where each IP is copied into, k will iterate over ++ * the IPs in each list. ++ */ ++ for (i = j = k = 0; ++ i < MPTCP_GW_SYSCTL_MAX_LEN && k < MPTCP_GW_MAX_LISTS; ++ ++i) { ++ if (gateways[i] == '-' || gateways[i] == ',' || gateways[i] == '\0') { ++ /* If the temp IP is empty and the current list is ++ * empty, we are done. 
++ */ ++ if (j == 0 && mptcp_gws->len[k] == 0) ++ break; ++ ++ /* Terminate the temp IP string, then if it is ++ * non-empty parse the IP and copy it. ++ */ ++ tmp_string[j] = '\0'; ++ if (j > 0) { ++ mptcp_debug("mptcp_parse_gateway_list tmp: %s i: %d\n", tmp_string, i); ++ ++ ret = in4_pton(tmp_string, strlen(tmp_string), ++ (u8 *)&tmp_addr.s_addr, '\0', ++ NULL); ++ ++ if (ret) { ++ mptcp_debug("mptcp_parse_gateway_list ret: %d s_addr: %pI4\n", ++ ret, ++ &tmp_addr.s_addr); ++ memcpy(&mptcp_gws->list[k][mptcp_gws->len[k]].s_addr, ++ &tmp_addr.s_addr, ++ sizeof(tmp_addr.s_addr)); ++ mptcp_gws->len[k]++; ++ j = 0; ++ tmp_string[j] = '\0'; ++ /* Since we can't impose a limit to ++ * what the user can input, make sure ++ * there are not too many IPs in the ++ * SYSCTL string. ++ */ ++ if (mptcp_gws->len[k] > MPTCP_GW_LIST_MAX_LEN) { ++ mptcp_debug("mptcp_parse_gateway_list too many members in list %i: max %i\n", ++ k, ++ MPTCP_GW_LIST_MAX_LEN); ++ goto error; ++ } ++ } else { ++ goto error; ++ } ++ } ++ ++ if (gateways[i] == '-' || gateways[i] == '\0') ++ ++k; ++ } else { ++ tmp_string[j] = gateways[i]; ++ ++j; ++ } ++ } ++ ++ /* Number of flows is number of gateway lists plus master flow */ ++ mptcp_binder_ndiffports = k+1; ++ ++ write_unlock(&mptcp_gws_lock); ++ kfree(tmp_string); ++ ++ return 0; ++ ++error: ++ memset(mptcp_gws, 0, sizeof(struct mptcp_gw_list)); ++ memset(gateways, 0, sizeof(char) * MPTCP_GW_SYSCTL_MAX_LEN); ++ write_unlock(&mptcp_gws_lock); ++ kfree(tmp_string); ++ return -1; ++} ++ ++/** ++ * Create all new subflows, by doing calls to mptcp_initX_subsockets ++ * ++ * This function uses a goto next_subflow, to allow releasing the lock between ++ * new subflows and giving other processes a chance to do some work on the ++ * socket and potentially finishing the communication. 
++ **/ ++static void create_subflow_worker(struct work_struct *work) ++{ ++ const struct binder_priv *pm_priv = container_of(work, ++ struct binder_priv, ++ subflow_work); ++ struct mptcp_cb *mpcb = pm_priv->mpcb; ++ struct sock *meta_sk = mpcb->meta_sk; ++ int iter = 0; ++ ++next_subflow: ++ if (iter) { ++ release_sock(meta_sk); ++ mutex_unlock(&mpcb->mpcb_mutex); ++ ++ cond_resched(); ++ } ++ mutex_lock(&mpcb->mpcb_mutex); ++ lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING); ++ ++ if (!mptcp(tcp_sk(meta_sk))) ++ goto exit; ++ ++ iter++; ++ ++ if (sock_flag(meta_sk, SOCK_DEAD)) ++ goto exit; ++ ++ if (mpcb->master_sk && ++ !tcp_sk(mpcb->master_sk)->mptcp->fully_established) ++ goto exit; ++ ++ if (mptcp_binder_ndiffports > iter && ++ mptcp_binder_ndiffports > mptcp_subflow_count(mpcb)) { ++ struct mptcp_loc4 loc; ++ struct mptcp_rem4 rem; ++ ++ loc.addr.s_addr = inet_sk(meta_sk)->inet_saddr; ++ loc.loc4_id = 0; ++ loc.low_prio = 0; ++ ++ rem.addr.s_addr = inet_sk(meta_sk)->inet_daddr; ++ rem.port = inet_sk(meta_sk)->inet_dport; ++ rem.rem4_id = 0; /* Default 0 */ ++ ++ mptcp_init4_subsockets(meta_sk, &loc, &rem); ++ ++ goto next_subflow; ++ } ++ ++exit: ++ release_sock(meta_sk); ++ mutex_unlock(&mpcb->mpcb_mutex); ++ mptcp_mpcb_put(mpcb); ++ sock_put(meta_sk); ++} ++ ++static void binder_new_session(const struct sock *meta_sk) ++{ ++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; ++ struct binder_priv *fmp = (struct binder_priv *)&mpcb->mptcp_pm[0]; ++ static DEFINE_SPINLOCK(flow_lock); ++ ++#if IS_ENABLED(CONFIG_IPV6) ++ if (meta_sk->sk_family == AF_INET6 && ++ !mptcp_v6_is_v4_mapped(meta_sk)) { ++ mptcp_fallback_default(mpcb); ++ return; ++ } ++#endif ++ ++ /* Initialize workqueue-struct */ ++ INIT_WORK(&fmp->subflow_work, create_subflow_worker); ++ fmp->mpcb = mpcb; ++ ++ fmp->flow_lock = &flow_lock; ++} ++ ++static void binder_create_subflows(struct sock *meta_sk) ++{ ++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; ++ struct binder_priv *pm_priv = (struct binder_priv *)&mpcb->mptcp_pm[0]; ++ ++ if (mptcp_in_infinite_mapping_weak(mpcb) || ++ mpcb->server_side || sock_flag(meta_sk, SOCK_DEAD)) ++ return; ++ ++ if (!work_pending(&pm_priv->subflow_work)) { ++ sock_hold(meta_sk); ++ refcount_inc(&mpcb->mpcb_refcnt); ++ queue_work(mptcp_wq, &pm_priv->subflow_work); ++ } ++} ++ ++static int binder_get_local_id(const struct sock *meta_sk, sa_family_t family, ++ union inet_addr *addr, bool *low_prio) ++{ ++ return 0; ++} ++ ++/* Callback functions, executed when syctl mptcp.mptcp_gateways is updated. ++ * Inspired from proc_tcp_congestion_control(). 
++ */ ++static int proc_mptcp_gateways(struct ctl_table *ctl, int write, ++ void __user *buffer, size_t *lenp, ++ loff_t *ppos) ++{ ++ int ret; ++ struct ctl_table tbl = { ++ .maxlen = MPTCP_GW_SYSCTL_MAX_LEN, ++ }; ++ ++ if (write) { ++ tbl.data = kzalloc(MPTCP_GW_SYSCTL_MAX_LEN, GFP_KERNEL); ++ if (tbl.data == NULL) ++ return -ENOMEM; ++ ret = proc_dostring(&tbl, write, buffer, lenp, ppos); ++ if (ret == 0) { ++ ret = mptcp_parse_gateway_ipv4(tbl.data); ++ memcpy(ctl->data, tbl.data, MPTCP_GW_SYSCTL_MAX_LEN); ++ } ++ kfree(tbl.data); ++ } else { ++ ret = proc_dostring(ctl, write, buffer, lenp, ppos); ++ } ++ ++ ++ return ret; ++} ++ ++static struct mptcp_pm_ops binder __read_mostly = { ++ .new_session = binder_new_session, ++ .fully_established = binder_create_subflows, ++ .get_local_id = binder_get_local_id, ++ .init_subsocket_v4 = mptcp_v4_add_lsrr, ++ .name = "binder", ++ .owner = THIS_MODULE, ++}; ++ ++static struct ctl_table binder_table[] = { ++ { ++ .procname = "mptcp_binder_gateways", ++ .data = &sysctl_mptcp_binder_gateways, ++ .maxlen = sizeof(char) * MPTCP_GW_SYSCTL_MAX_LEN, ++ .mode = 0644, ++ .proc_handler = &proc_mptcp_gateways ++ }, ++ { } ++}; ++ ++static struct ctl_table_header *mptcp_sysctl_binder; ++ ++/* General initialization of MPTCP_PM */ ++static int __init binder_register(void) ++{ ++ mptcp_gws = kzalloc(sizeof(*mptcp_gws), GFP_KERNEL); ++ if (!mptcp_gws) ++ return -ENOMEM; ++ ++ rwlock_init(&mptcp_gws_lock); ++ ++ BUILD_BUG_ON(sizeof(struct binder_priv) > MPTCP_PM_SIZE); ++ ++ mptcp_sysctl_binder = register_net_sysctl(&init_net, "net/mptcp", ++ binder_table); ++ if (!mptcp_sysctl_binder) ++ goto sysctl_fail; ++ ++ if (mptcp_register_path_manager(&binder)) ++ goto pm_failed; ++ ++ return 0; ++ ++pm_failed: ++ unregister_net_sysctl_table(mptcp_sysctl_binder); ++sysctl_fail: ++ kfree(mptcp_gws); ++ ++ return -1; ++} ++ ++static void binder_unregister(void) ++{ ++ mptcp_unregister_path_manager(&binder); ++ unregister_net_sysctl_table(mptcp_sysctl_binder); ++ kfree(mptcp_gws); ++} ++ ++module_init(binder_register); ++module_exit(binder_unregister); ++ ++MODULE_AUTHOR("Luca Boccassi, Duncan Eastoe, Christoph Paasch (ndiffports)"); ++MODULE_LICENSE("GPL"); ++MODULE_DESCRIPTION("BINDER MPTCP"); ++MODULE_VERSION("0.1"); +diff --git a/net/mptcp/mptcp_blest.c b/net/mptcp/mptcp_blest.c +new file mode 100644 +index 000000000000..22e25dd0d44e +--- /dev/null ++++ b/net/mptcp/mptcp_blest.c +@@ -0,0 +1,285 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* MPTCP Scheduler to reduce HoL-blocking and spurious retransmissions. ++ * ++ * Algorithm Design: ++ * Simone Ferlin ++ * Ozgu Alay ++ * Olivier Mehani ++ * Roksana Boreli ++ * ++ * Initial Implementation: ++ * Simone Ferlin ++ * ++ * Additional Authors: ++ * Daniel Weber ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License ++ * as published by the Free Software Foundation; either version ++ * 2 of the License, or (at your option) any later version. 
++ */ ++ ++#include ++#include ++ ++static unsigned char lambda __read_mostly = 12; ++module_param(lambda, byte, 0644); ++MODULE_PARM_DESC(lambda, "Divided by 10 for scaling factor of fast flow rate estimation"); ++ ++static unsigned char max_lambda __read_mostly = 13; ++module_param(max_lambda, byte, 0644); ++MODULE_PARM_DESC(max_lambda, "Divided by 10 for maximum scaling factor of fast flow rate estimation"); ++ ++static unsigned char min_lambda __read_mostly = 10; ++module_param(min_lambda, byte, 0644); ++MODULE_PARM_DESC(min_lambda, "Divided by 10 for minimum scaling factor of fast flow rate estimation"); ++ ++static unsigned char dyn_lambda_good = 10; /* 1% */ ++module_param(dyn_lambda_good, byte, 0644); ++MODULE_PARM_DESC(dyn_lambda_good, "Decrease of lambda in positive case."); ++ ++static unsigned char dyn_lambda_bad = 40; /* 4% */ ++module_param(dyn_lambda_bad, byte, 0644); ++MODULE_PARM_DESC(dyn_lambda_bad, "Increase of lambda in negative case."); ++ ++struct blestsched_priv { ++ u32 last_rbuf_opti; ++ u32 min_srtt_us; ++ u32 max_srtt_us; ++}; ++ ++struct blestsched_cb { ++ s16 lambda_1000; /* values range from min_lambda * 100 to max_lambda * 100 */ ++ u32 last_lambda_update; ++}; ++ ++static struct blestsched_priv *blestsched_get_priv(const struct tcp_sock *tp) ++{ ++ return (struct blestsched_priv *)&tp->mptcp->mptcp_sched[0]; ++} ++ ++static struct blestsched_cb *blestsched_get_cb(const struct tcp_sock *tp) ++{ ++ return (struct blestsched_cb *)&tp->mpcb->mptcp_sched[0]; ++} ++ ++static void blestsched_update_lambda(struct sock *meta_sk, struct sock *sk) ++{ ++ struct blestsched_cb *blest_cb = blestsched_get_cb(tcp_sk(meta_sk)); ++ struct blestsched_priv *blest_p = blestsched_get_priv(tcp_sk(sk)); ++ ++ if (tcp_jiffies32 - blest_cb->last_lambda_update < usecs_to_jiffies(blest_p->min_srtt_us >> 3)) ++ return; ++ ++ /* if there have been retransmissions of packets of the slow flow ++ * during the slow flows last RTT => increase lambda ++ * otherwise decrease ++ */ ++ if (tcp_sk(meta_sk)->retrans_stamp) { ++ /* need to slow down on the slow flow */ ++ blest_cb->lambda_1000 += dyn_lambda_bad; ++ } else { ++ /* use the slow flow more */ ++ blest_cb->lambda_1000 -= dyn_lambda_good; ++ } ++ ++ /* cap lambda_1000 to its value range */ ++ blest_cb->lambda_1000 = min_t(s16, blest_cb->lambda_1000, max_lambda * 100); ++ blest_cb->lambda_1000 = max_t(s16, blest_cb->lambda_1000, min_lambda * 100); ++ ++ blest_cb->last_lambda_update = tcp_jiffies32; ++} ++ ++/* how many bytes will sk send during the rtt of another, slower flow? */ ++static u32 blestsched_estimate_bytes(struct sock *sk, u32 time_8) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct blestsched_priv *blest_p = blestsched_get_priv(tp); ++ struct blestsched_cb *blest_cb = blestsched_get_cb(mptcp_meta_tp(tp)); ++ u32 avg_rtt, num_rtts, ca_cwnd, packets; ++ ++ avg_rtt = (blest_p->min_srtt_us + blest_p->max_srtt_us) / 2; ++ if (avg_rtt == 0) ++ num_rtts = 1; /* sanity */ ++ else ++ num_rtts = (time_8 / avg_rtt) + 1; /* round up */ ++ ++ /* during num_rtts, how many bytes will be sent on the flow? 
++ * assumes for simplification that Reno is applied as congestion-control ++ */ ++ if (tp->snd_ssthresh == TCP_INFINITE_SSTHRESH) { ++ /* we are in initial slow start */ ++ if (num_rtts > 16) ++ num_rtts = 16; /* cap for sanity */ ++ packets = tp->snd_cwnd * ((1 << num_rtts) - 1); /* cwnd + 2*cwnd + 4*cwnd */ ++ } else { ++ ca_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh + 1); /* assume we jump to CA already */ ++ packets = (ca_cwnd + (num_rtts - 1) / 2) * num_rtts; ++ } ++ ++ return div_u64(((u64)packets) * tp->mss_cache * blest_cb->lambda_1000, 1000); ++} ++ ++static u32 blestsched_estimate_linger_time(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct blestsched_priv *blest_p = blestsched_get_priv(tp); ++ u32 estimate, slope, inflight, cwnd; ++ ++ inflight = tcp_packets_in_flight(tp) + 1; /* take into account the new one */ ++ cwnd = tp->snd_cwnd; ++ ++ if (inflight >= cwnd) { ++ estimate = blest_p->max_srtt_us; ++ } else { ++ slope = blest_p->max_srtt_us - blest_p->min_srtt_us; ++ if (cwnd == 0) ++ cwnd = 1; /* sanity */ ++ estimate = blest_p->min_srtt_us + (slope * inflight) / cwnd; ++ } ++ ++ return (tp->srtt_us > estimate) ? tp->srtt_us : estimate; ++} ++ ++/* This is the BLEST scheduler. This function decides on which flow to send ++ * a given MSS. If all subflows are found to be busy or the currently best ++ * subflow is estimated to possibly cause HoL-blocking, NULL is returned. ++ */ ++struct sock *blest_get_available_subflow(struct sock *meta_sk, struct sk_buff *skb, ++ bool zero_wnd_test) ++{ ++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; ++ struct sock *bestsk, *minsk = NULL; ++ struct tcp_sock *meta_tp, *besttp; ++ struct mptcp_tcp_sock *mptcp; ++ struct blestsched_priv *blest_p; ++ u32 min_srtt = U32_MAX; ++ ++ /* Answer data_fin on same subflow!!! */ ++ if (meta_sk->sk_shutdown & RCV_SHUTDOWN && ++ skb && mptcp_is_data_fin(skb)) { ++ mptcp_for_each_sub(mpcb, mptcp) { ++ bestsk = mptcp_to_sock(mptcp); ++ ++ if (tcp_sk(bestsk)->mptcp->path_index == mpcb->dfin_path_index && ++ mptcp_is_available(bestsk, skb, zero_wnd_test)) ++ return bestsk; ++ } ++ } ++ ++ /* First, find the overall best subflow */ ++ mptcp_for_each_sub(mpcb, mptcp) { ++ bestsk = mptcp_to_sock(mptcp); ++ besttp = tcp_sk(bestsk); ++ blest_p = blestsched_get_priv(besttp); ++ ++ /* Set of states for which we are allowed to send data */ ++ if (!mptcp_sk_can_send(bestsk)) ++ continue; ++ ++ /* We do not send data on this subflow unless it is ++ * fully established, i.e. the 4th ack has been received. ++ */ ++ if (besttp->mptcp->pre_established) ++ continue; ++ ++ blest_p->min_srtt_us = min(blest_p->min_srtt_us, besttp->srtt_us); ++ blest_p->max_srtt_us = max(blest_p->max_srtt_us, besttp->srtt_us); ++ ++ /* record minimal rtt */ ++ if (besttp->srtt_us < min_srtt) { ++ min_srtt = besttp->srtt_us; ++ minsk = bestsk; ++ } ++ } ++ ++ /* find the current best subflow according to the default scheduler */ ++ bestsk = get_available_subflow(meta_sk, skb, zero_wnd_test); ++ ++ /* if we decided to use a slower flow, we have the option of not using it at all */ ++ if (bestsk && minsk && bestsk != minsk) { ++ u32 slow_linger_time, fast_bytes, slow_inflight_bytes, slow_bytes, avail_space; ++ u32 buffered_bytes = 0; ++ ++ meta_tp = tcp_sk(meta_sk); ++ besttp = tcp_sk(bestsk); ++ ++ blestsched_update_lambda(meta_sk, bestsk); ++ ++ /* if we send this SKB now, it will be acked in besttp->srtt seconds ++ * during this time: how many bytes will we send on the fast flow? 
++ */ ++ slow_linger_time = blestsched_estimate_linger_time(bestsk); ++ fast_bytes = blestsched_estimate_bytes(minsk, slow_linger_time); ++ ++ if (skb) ++ buffered_bytes = skb->len; ++ ++ /* is the required space available in the mptcp meta send window? ++ * we assume that all bytes inflight on the slow path will be acked in besttp->srtt seconds ++ * (just like the SKB if it was sent now) -> that means that those inflight bytes will ++ * keep occupying space in the meta window until then ++ */ ++ slow_inflight_bytes = besttp->write_seq - besttp->snd_una; ++ slow_bytes = buffered_bytes + slow_inflight_bytes; // bytes of this SKB plus those in flight already ++ ++ avail_space = (slow_bytes < meta_tp->snd_wnd) ? (meta_tp->snd_wnd - slow_bytes) : 0; ++ ++ if (fast_bytes > avail_space) { ++ /* sending this SKB on the slow flow means ++ * we wouldn't be able to send all the data we'd like to send on the fast flow ++ * so don't do that ++ */ ++ return NULL; ++ } ++ } ++ ++ return bestsk; ++} ++ ++static void blestsched_init(struct sock *sk) ++{ ++ struct blestsched_priv *blest_p = blestsched_get_priv(tcp_sk(sk)); ++ struct blestsched_cb *blest_cb = blestsched_get_cb(tcp_sk(mptcp_meta_sk(sk))); ++ ++ blest_p->last_rbuf_opti = tcp_jiffies32; ++ blest_p->min_srtt_us = U32_MAX; ++ blest_p->max_srtt_us = 0; ++ ++ if (!blest_cb->lambda_1000) { ++ blest_cb->lambda_1000 = lambda * 100; ++ blest_cb->last_lambda_update = tcp_jiffies32; ++ } ++} ++ ++static struct mptcp_sched_ops mptcp_sched_blest = { ++ .get_subflow = blest_get_available_subflow, ++ .next_segment = mptcp_next_segment, ++ .init = blestsched_init, ++ .name = "blest", ++ .owner = THIS_MODULE, ++}; ++ ++static int __init blest_register(void) ++{ ++ BUILD_BUG_ON(sizeof(struct blestsched_priv) > MPTCP_SCHED_SIZE); ++ BUILD_BUG_ON(sizeof(struct blestsched_cb) > MPTCP_SCHED_DATA_SIZE); ++ ++ if (mptcp_register_scheduler(&mptcp_sched_blest)) ++ return -1; ++ ++ return 0; ++} ++ ++static void blest_unregister(void) ++{ ++ mptcp_unregister_scheduler(&mptcp_sched_blest); ++} ++ ++module_init(blest_register); ++module_exit(blest_unregister); ++ ++MODULE_AUTHOR("Simone Ferlin, Daniel Weber"); ++MODULE_LICENSE("GPL"); ++MODULE_DESCRIPTION("BLEST scheduler for MPTCP, based on default minimum RTT scheduler"); ++MODULE_VERSION("0.95"); +diff --git a/net/mptcp/mptcp_coupled.c b/net/mptcp/mptcp_coupled.c +new file mode 100644 +index 000000000000..9eb7628053f6 +--- /dev/null ++++ b/net/mptcp/mptcp_coupled.c +@@ -0,0 +1,262 @@ ++/* ++ * MPTCP implementation - Linked Increase congestion control Algorithm (LIA) ++ * ++ * Initial Design & Implementation: ++ * SĂ©bastien BarrĂ© ++ * ++ * Current Maintainer & Author: ++ * Christoph Paasch ++ * ++ * Additional authors: ++ * Jaakko Korkeaniemi ++ * Gregory Detal ++ * Fabien DuchĂªne ++ * Andreas Seelinger ++ * Lavkesh Lahngir ++ * Andreas Ripke ++ * Vlad Dogaru ++ * Octavian Purdila ++ * John Ronan ++ * Catalin Nicutar ++ * Brandon Heller ++ * ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License ++ * as published by the Free Software Foundation; either version ++ * 2 of the License, or (at your option) any later version. ++ */ ++#include ++#include ++ ++#include ++ ++/* Scaling is done in the numerator with alpha_scale_num and in the denominator ++ * with alpha_scale_den. ++ * ++ * To downscale, we just need to use alpha_scale. 
++ * ++ * We have: alpha_scale = alpha_scale_num / (alpha_scale_den ^ 2) ++ */ ++static int alpha_scale_den = 10; ++static int alpha_scale_num = 32; ++static int alpha_scale = 12; ++ ++struct mptcp_ccc { ++ u64 alpha; ++ bool forced_update; ++}; ++ ++static inline int mptcp_ccc_sk_can_send(const struct sock *sk) ++{ ++ return mptcp_sk_can_send(sk) && tcp_sk(sk)->srtt_us; ++} ++ ++static inline u64 mptcp_get_alpha(const struct sock *meta_sk) ++{ ++ return ((struct mptcp_ccc *)inet_csk_ca(meta_sk))->alpha; ++} ++ ++static inline void mptcp_set_alpha(const struct sock *meta_sk, u64 alpha) ++{ ++ ((struct mptcp_ccc *)inet_csk_ca(meta_sk))->alpha = alpha; ++} ++ ++static inline u64 mptcp_ccc_scale(u32 val, int scale) ++{ ++ return (u64) val << scale; ++} ++ ++static inline bool mptcp_get_forced(const struct sock *meta_sk) ++{ ++ return ((struct mptcp_ccc *)inet_csk_ca(meta_sk))->forced_update; ++} ++ ++static inline void mptcp_set_forced(const struct sock *meta_sk, bool force) ++{ ++ ((struct mptcp_ccc *)inet_csk_ca(meta_sk))->forced_update = force; ++} ++ ++static void mptcp_ccc_recalc_alpha(const struct sock *sk) ++{ ++ const struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb; ++ const struct mptcp_tcp_sock *mptcp; ++ int best_cwnd = 0, best_rtt = 0, can_send = 0; ++ u64 max_numerator = 0, sum_denominator = 0, alpha = 1; ++ ++ if (!mpcb) ++ return; ++ ++ /* Do regular alpha-calculation for multiple subflows */ ++ ++ /* Find the max numerator of the alpha-calculation */ ++ mptcp_for_each_sub(mpcb, mptcp) { ++ const struct sock *sub_sk = mptcp_to_sock(mptcp); ++ struct tcp_sock *sub_tp = tcp_sk(sub_sk); ++ u64 tmp; ++ ++ if (!mptcp_ccc_sk_can_send(sub_sk)) ++ continue; ++ ++ can_send++; ++ ++ /* We need to look for the path, that provides the max-value. ++ * Integer-overflow is not possible here, because ++ * tmp will be in u64. 
++ */ ++ tmp = div64_u64(mptcp_ccc_scale(sub_tp->snd_cwnd, ++ alpha_scale_num), (u64)sub_tp->srtt_us * sub_tp->srtt_us); ++ ++ if (tmp >= max_numerator) { ++ max_numerator = tmp; ++ best_cwnd = sub_tp->snd_cwnd; ++ best_rtt = sub_tp->srtt_us; ++ } ++ } ++ ++ /* No subflow is able to send - we don't care anymore */ ++ if (unlikely(!can_send)) ++ goto exit; ++ ++ /* Calculate the denominator */ ++ mptcp_for_each_sub(mpcb, mptcp) { ++ const struct sock *sub_sk = mptcp_to_sock(mptcp); ++ struct tcp_sock *sub_tp = tcp_sk(sub_sk); ++ ++ if (!mptcp_ccc_sk_can_send(sub_sk)) ++ continue; ++ ++ sum_denominator += div_u64( ++ mptcp_ccc_scale(sub_tp->snd_cwnd, ++ alpha_scale_den) * best_rtt, ++ sub_tp->srtt_us); ++ } ++ sum_denominator *= sum_denominator; ++ if (unlikely(!sum_denominator)) { ++ pr_err("%s: sum_denominator == 0\n", __func__); ++ mptcp_for_each_sub(mpcb, mptcp) { ++ const struct sock *sub_sk = mptcp_to_sock(mptcp); ++ struct tcp_sock *sub_tp = tcp_sk(sub_sk); ++ pr_err("%s: pi:%d, state:%d\n, rtt:%u, cwnd: %u", ++ __func__, sub_tp->mptcp->path_index, ++ sub_sk->sk_state, sub_tp->srtt_us, ++ sub_tp->snd_cwnd); ++ } ++ } ++ ++ alpha = div64_u64(mptcp_ccc_scale(best_cwnd, alpha_scale_num), sum_denominator); ++ ++ if (unlikely(!alpha)) ++ alpha = 1; ++ ++exit: ++ mptcp_set_alpha(mptcp_meta_sk(sk), alpha); ++} ++ ++static void mptcp_ccc_init(struct sock *sk) ++{ ++ if (mptcp(tcp_sk(sk))) { ++ mptcp_set_forced(mptcp_meta_sk(sk), 0); ++ mptcp_set_alpha(mptcp_meta_sk(sk), 1); ++ } ++ /* If we do not mptcp, behave like reno: return */ ++} ++ ++static void mptcp_ccc_cwnd_event(struct sock *sk, enum tcp_ca_event event) ++{ ++ if (event == CA_EVENT_LOSS) ++ mptcp_ccc_recalc_alpha(sk); ++} ++ ++static void mptcp_ccc_set_state(struct sock *sk, u8 ca_state) ++{ ++ if (!mptcp(tcp_sk(sk))) ++ return; ++ ++ mptcp_set_forced(mptcp_meta_sk(sk), 1); ++} ++ ++static void mptcp_ccc_cong_avoid(struct sock *sk, u32 ack, u32 acked) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ int snd_cwnd; ++ u64 alpha; ++ ++ if (!mptcp(tp)) { ++ tcp_reno_cong_avoid(sk, ack, acked); ++ return; ++ } ++ ++ if (!tcp_is_cwnd_limited(sk)) ++ return; ++ ++ if (tcp_in_slow_start(tp)) { ++ /* In "safe" area, increase. */ ++ tcp_slow_start(tp, acked); ++ mptcp_ccc_recalc_alpha(sk); ++ return; ++ } ++ ++ if (mptcp_get_forced(mptcp_meta_sk(sk))) { ++ mptcp_ccc_recalc_alpha(sk); ++ mptcp_set_forced(mptcp_meta_sk(sk), 0); ++ } ++ ++ alpha = mptcp_get_alpha(mptcp_meta_sk(sk)); ++ ++ /* This may happen, if at the initialization, the mpcb ++ * was not yet attached to the sock, and thus ++ * initializing alpha failed. ++ */ ++ if (unlikely(!alpha)) ++ alpha = 1; ++ ++ snd_cwnd = (int)div_u64((u64)mptcp_ccc_scale(1, alpha_scale), alpha); ++ ++ /* snd_cwnd_cnt >= max (scale * tot_cwnd / alpha, cwnd) ++ * Thus, we select here the max value. 
++ */ ++ if (snd_cwnd < tp->snd_cwnd) ++ snd_cwnd = tp->snd_cwnd; ++ ++ if (tp->snd_cwnd_cnt >= snd_cwnd) { ++ if (tp->snd_cwnd < tp->snd_cwnd_clamp) { ++ tp->snd_cwnd++; ++ mptcp_ccc_recalc_alpha(sk); ++ } ++ ++ tp->snd_cwnd_cnt = 0; ++ } else { ++ tp->snd_cwnd_cnt++; ++ } ++} ++ ++static struct tcp_congestion_ops mptcp_ccc = { ++ .init = mptcp_ccc_init, ++ .ssthresh = tcp_reno_ssthresh, ++ .cong_avoid = mptcp_ccc_cong_avoid, ++ .undo_cwnd = tcp_reno_undo_cwnd, ++ .cwnd_event = mptcp_ccc_cwnd_event, ++ .set_state = mptcp_ccc_set_state, ++ .owner = THIS_MODULE, ++ .name = "lia", ++}; ++ ++static int __init mptcp_ccc_register(void) ++{ ++ BUILD_BUG_ON(sizeof(struct mptcp_ccc) > ICSK_CA_PRIV_SIZE); ++ return tcp_register_congestion_control(&mptcp_ccc); ++} ++ ++static void __exit mptcp_ccc_unregister(void) ++{ ++ tcp_unregister_congestion_control(&mptcp_ccc); ++} ++ ++module_init(mptcp_ccc_register); ++module_exit(mptcp_ccc_unregister); ++ ++MODULE_AUTHOR("Christoph Paasch, SĂ©bastien BarrĂ©"); ++MODULE_LICENSE("GPL"); ++MODULE_DESCRIPTION("MPTCP LINKED INCREASE CONGESTION CONTROL ALGORITHM"); ++MODULE_VERSION("0.1"); +diff --git a/net/mptcp/mptcp_ctrl.c b/net/mptcp/mptcp_ctrl.c +new file mode 100644 +index 000000000000..db01ec142111 +--- /dev/null ++++ b/net/mptcp/mptcp_ctrl.c +@@ -0,0 +1,3313 @@ ++/* ++ * MPTCP implementation - MPTCP-control ++ * ++ * Initial Design & Implementation: ++ * SĂ©bastien BarrĂ© ++ * ++ * Current Maintainer & Author: ++ * Christoph Paasch ++ * ++ * Additional authors: ++ * Jaakko Korkeaniemi ++ * Gregory Detal ++ * Fabien DuchĂªne ++ * Andreas Seelinger ++ * Lavkesh Lahngir ++ * Andreas Ripke ++ * Vlad Dogaru ++ * Octavian Purdila ++ * John Ronan ++ * Catalin Nicutar ++ * Brandon Heller ++ * ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License ++ * as published by the Free Software Foundation; either version ++ * 2 of the License, or (at your option) any later version. 
++ */ ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#if IS_ENABLED(CONFIG_IPV6) ++#include ++#include ++#endif ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static struct kmem_cache *mptcp_sock_cache __read_mostly; ++static struct kmem_cache *mptcp_cb_cache __read_mostly; ++static struct kmem_cache *mptcp_tw_cache __read_mostly; ++ ++int sysctl_mptcp_enabled __read_mostly = 1; ++int sysctl_mptcp_version __read_mostly = 0; ++static int min_mptcp_version; ++static int max_mptcp_version = 1; ++int sysctl_mptcp_checksum __read_mostly = 1; ++int sysctl_mptcp_debug __read_mostly; ++EXPORT_SYMBOL(sysctl_mptcp_debug); ++int sysctl_mptcp_syn_retries __read_mostly = 3; ++ ++bool mptcp_init_failed __read_mostly; ++ ++struct static_key mptcp_static_key = STATIC_KEY_INIT_FALSE; ++EXPORT_SYMBOL(mptcp_static_key); ++ ++static void mptcp_key_hash(u8 version, u64 key, u32 *token, u64 *idsn); ++ ++static int proc_mptcp_path_manager(struct ctl_table *ctl, int write, ++ void __user *buffer, size_t *lenp, ++ loff_t *ppos) ++{ ++ char val[MPTCP_PM_NAME_MAX]; ++ struct ctl_table tbl = { ++ .data = val, ++ .maxlen = MPTCP_PM_NAME_MAX, ++ }; ++ int ret; ++ ++ mptcp_get_default_path_manager(val); ++ ++ ret = proc_dostring(&tbl, write, buffer, lenp, ppos); ++ if (write && ret == 0) ++ ret = mptcp_set_default_path_manager(val); ++ return ret; ++} ++ ++static int proc_mptcp_scheduler(struct ctl_table *ctl, int write, ++ void __user *buffer, size_t *lenp, ++ loff_t *ppos) ++{ ++ char val[MPTCP_SCHED_NAME_MAX]; ++ struct ctl_table tbl = { ++ .data = val, ++ .maxlen = MPTCP_SCHED_NAME_MAX, ++ }; ++ int ret; ++ ++ mptcp_get_default_scheduler(val); ++ ++ ret = proc_dostring(&tbl, write, buffer, lenp, ppos); ++ if (write && ret == 0) ++ ret = mptcp_set_default_scheduler(val); ++ return ret; ++} ++ ++static struct ctl_table mptcp_table[] = { ++ { ++ .procname = "mptcp_enabled", ++ .data = &sysctl_mptcp_enabled, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec ++ }, ++ { ++ .procname = "mptcp_version", ++ .data = &sysctl_mptcp_version, ++ .mode = 0644, ++ .maxlen = sizeof(int), ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = &min_mptcp_version, ++ .extra2 = &max_mptcp_version, ++ }, ++ { ++ .procname = "mptcp_checksum", ++ .data = &sysctl_mptcp_checksum, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec ++ }, ++ { ++ .procname = "mptcp_debug", ++ .data = &sysctl_mptcp_debug, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec ++ }, ++ { ++ .procname = "mptcp_syn_retries", ++ .data = &sysctl_mptcp_syn_retries, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec ++ }, ++ { ++ .procname = "mptcp_path_manager", ++ .mode = 0644, ++ .maxlen = MPTCP_PM_NAME_MAX, ++ .proc_handler = proc_mptcp_path_manager, ++ }, ++ { ++ .procname = "mptcp_scheduler", ++ .mode = 0644, ++ .maxlen = MPTCP_SCHED_NAME_MAX, ++ .proc_handler = proc_mptcp_scheduler, ++ }, ++ { } ++}; ++ ++static inline u32 mptcp_hash_tk(u32 token, struct mptcp_hashtable *htable) ++{ ++ return token & htable->mask; ++} ++ ++struct mptcp_hashtable mptcp_tk_htable; ++EXPORT_SYMBOL(mptcp_tk_htable); ++ ++/* The following hash table is used to avoid collision of token */ ++static struct mptcp_hashtable mptcp_reqsk_tk_htb; ++ ++/* Lock, 
protecting the two hash-tables that hold the token. Namely, ++ * mptcp_reqsk_tk_htb and tk_hashtable ++ */ ++static spinlock_t mptcp_tk_hashlock; ++ ++static bool mptcp_reqsk_find_tk(const u32 token) ++{ ++ const u32 hash = mptcp_hash_tk(token, &mptcp_reqsk_tk_htb); ++ const struct mptcp_request_sock *mtreqsk; ++ const struct hlist_nulls_node *node; ++ ++begin: ++ hlist_nulls_for_each_entry_rcu(mtreqsk, node, ++ &mptcp_reqsk_tk_htb.hashtable[hash], ++ hash_entry) { ++ if (token == mtreqsk->mptcp_loc_token) ++ return true; ++ } ++ /* A request-socket is destroyed by RCU. So, it might have been recycled ++ * and put into another hash-table list. So, after the lookup we may ++ * end up in a different list. So, we may need to restart. ++ * ++ * See also the comment in __inet_lookup_established. ++ */ ++ if (get_nulls_value(node) != hash) ++ goto begin; ++ return false; ++} ++ ++static void mptcp_reqsk_insert_tk(struct request_sock *reqsk, const u32 token) ++{ ++ u32 hash = mptcp_hash_tk(token, &mptcp_reqsk_tk_htb); ++ ++ hlist_nulls_add_head_rcu(&mptcp_rsk(reqsk)->hash_entry, ++ &mptcp_reqsk_tk_htb.hashtable[hash]); ++} ++ ++static void mptcp_reqsk_remove_tk(const struct request_sock *reqsk) ++{ ++ rcu_read_lock(); ++ local_bh_disable(); ++ spin_lock(&mptcp_tk_hashlock); ++ hlist_nulls_del_init_rcu(&mptcp_rsk(reqsk)->hash_entry); ++ spin_unlock(&mptcp_tk_hashlock); ++ local_bh_enable(); ++ rcu_read_unlock(); ++} ++ ++void mptcp_reqsk_destructor(struct request_sock *req) ++{ ++ if (!mptcp_rsk(req)->is_sub) ++ mptcp_reqsk_remove_tk(req); ++} ++ ++static void __mptcp_hash_insert(struct tcp_sock *meta_tp, const u32 token) ++{ ++ u32 hash = mptcp_hash_tk(token, &mptcp_tk_htable); ++ ++ hlist_nulls_add_head_rcu(&meta_tp->tk_table, ++ &mptcp_tk_htable.hashtable[hash]); ++ meta_tp->inside_tk_table = 1; ++} ++ ++static bool mptcp_find_token(u32 token) ++{ ++ const u32 hash = mptcp_hash_tk(token, &mptcp_tk_htable); ++ const struct tcp_sock *meta_tp; ++ const struct hlist_nulls_node *node; ++ ++begin: ++ hlist_nulls_for_each_entry_rcu(meta_tp, node, ++ &mptcp_tk_htable.hashtable[hash], ++ tk_table) { ++ if (token == meta_tp->mptcp_loc_token) ++ return true; ++ } ++ /* A TCP-socket is destroyed by RCU. So, it might have been recycled ++ * and put into another hash-table list. So, after the lookup we may ++ * end up in a different list. So, we may need to restart. ++ * ++ * See also the comment in __inet_lookup_established. ++ */ ++ if (get_nulls_value(node) != hash) ++ goto begin; ++ return false; ++} ++ ++static void mptcp_set_key_reqsk(struct request_sock *req, ++ const struct sk_buff *skb, ++ u32 seed) ++{ ++ const struct inet_request_sock *ireq = inet_rsk(req); ++ struct mptcp_request_sock *mtreq = mptcp_rsk(req); ++ ++ if (skb->protocol == htons(ETH_P_IP)) { ++ mtreq->mptcp_loc_key = mptcp_v4_get_key(ip_hdr(skb)->saddr, ++ ip_hdr(skb)->daddr, ++ htons(ireq->ir_num), ++ ireq->ir_rmt_port, ++ seed); ++#if IS_ENABLED(CONFIG_IPV6) ++ } else { ++ mtreq->mptcp_loc_key = mptcp_v6_get_key(ipv6_hdr(skb)->saddr.s6_addr32, ++ ipv6_hdr(skb)->daddr.s6_addr32, ++ htons(ireq->ir_num), ++ ireq->ir_rmt_port, ++ seed); ++#endif ++ } ++ ++ mptcp_key_hash(mtreq->mptcp_ver, mtreq->mptcp_loc_key, &mtreq->mptcp_loc_token, NULL); ++} ++ ++/* New MPTCP-connection request, prepare a new token for the meta-socket that ++ * will be created in mptcp_check_req_master(), and store the received token. 
++ */ ++static void mptcp_reqsk_new_mptcp(struct request_sock *req, ++ const struct sock *sk, ++ const struct mptcp_options_received *mopt, ++ const struct sk_buff *skb) ++{ ++ struct mptcp_request_sock *mtreq = mptcp_rsk(req); ++ ++ inet_rsk(req)->saw_mpc = 1; ++ mtreq->mptcp_ver = mopt->mptcp_ver; ++ ++ rcu_read_lock(); ++ local_bh_disable(); ++ spin_lock(&mptcp_tk_hashlock); ++ do { ++ mptcp_set_key_reqsk(req, skb, mptcp_seed++); ++ } while (mptcp_reqsk_find_tk(mtreq->mptcp_loc_token) || ++ mptcp_find_token(mtreq->mptcp_loc_token)); ++ mptcp_reqsk_insert_tk(req, mtreq->mptcp_loc_token); ++ spin_unlock(&mptcp_tk_hashlock); ++ local_bh_enable(); ++ rcu_read_unlock(); ++ ++ if (mtreq->mptcp_ver == MPTCP_VERSION_0) { ++ mtreq->mptcp_rem_key = mopt->mptcp_sender_key; ++ mtreq->rem_key_set = 1; ++ } ++} ++ ++static int mptcp_reqsk_new_cookie(struct request_sock *req, ++ const struct sock *sk, ++ const struct mptcp_options_received *mopt, ++ const struct sk_buff *skb) ++{ ++ struct mptcp_request_sock *mtreq = mptcp_rsk(req); ++ ++ /* Must happen before mptcp_set_key_reqsk to generate the token with ++ * the proper hash algo. ++ */ ++ mtreq->mptcp_ver = mopt->mptcp_ver; ++ ++ rcu_read_lock(); ++ local_bh_disable(); ++ spin_lock(&mptcp_tk_hashlock); ++ ++ mptcp_set_key_reqsk(req, skb, tcp_rsk(req)->snt_isn); ++ ++ if (mptcp_reqsk_find_tk(mtreq->mptcp_loc_token) || ++ mptcp_find_token(mtreq->mptcp_loc_token)) { ++ spin_unlock(&mptcp_tk_hashlock); ++ local_bh_enable(); ++ rcu_read_unlock(); ++ return false; ++ } ++ ++ inet_rsk(req)->saw_mpc = 1; ++ ++ spin_unlock(&mptcp_tk_hashlock); ++ local_bh_enable(); ++ rcu_read_unlock(); ++ ++ if (mtreq->mptcp_ver == MPTCP_VERSION_0) { ++ mtreq->mptcp_rem_key = mopt->mptcp_sender_key; ++ mtreq->rem_key_set = 1; ++ } ++ ++ return true; ++} ++ ++static void mptcp_set_key_sk(const struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ const struct inet_sock *isk = inet_sk(sk); ++ ++ if (sk->sk_family == AF_INET) ++ tp->mptcp_loc_key = mptcp_v4_get_key(isk->inet_saddr, ++ isk->inet_daddr, ++ isk->inet_sport, ++ isk->inet_dport, ++ mptcp_seed++); ++#if IS_ENABLED(CONFIG_IPV6) ++ else ++ tp->mptcp_loc_key = mptcp_v6_get_key(inet6_sk(sk)->saddr.s6_addr32, ++ sk->sk_v6_daddr.s6_addr32, ++ isk->inet_sport, ++ isk->inet_dport, ++ mptcp_seed++); ++#endif ++ ++ mptcp_key_hash(tp->mptcp_ver, tp->mptcp_loc_key, &tp->mptcp_loc_token, NULL); ++} ++ ++#ifdef CONFIG_JUMP_LABEL ++static atomic_t mptcp_needed_deferred; ++static atomic_t mptcp_wanted; ++ ++static void mptcp_clear(struct work_struct *work) ++{ ++ int deferred = atomic_xchg(&mptcp_needed_deferred, 0); ++ int wanted; ++ ++ wanted = atomic_add_return(deferred, &mptcp_wanted); ++ if (wanted > 0) ++ static_key_enable(&mptcp_static_key); ++ else ++ static_key_disable(&mptcp_static_key); ++} ++ ++static DECLARE_WORK(mptcp_work, mptcp_clear); ++#endif ++ ++static void mptcp_enable_static_key_bh(void) ++{ ++#ifdef CONFIG_JUMP_LABEL ++ int wanted; ++ ++ while (1) { ++ wanted = atomic_read(&mptcp_wanted); ++ if (wanted <= 0) ++ break; ++ if (atomic_cmpxchg(&mptcp_wanted, wanted, wanted + 1) == wanted) ++ return; ++ } ++ atomic_inc(&mptcp_needed_deferred); ++ schedule_work(&mptcp_work); ++#else ++ static_key_slow_inc(&mptcp_static_key); ++#endif ++} ++ ++static void mptcp_enable_static_key(void) ++{ ++#ifdef CONFIG_JUMP_LABEL ++ atomic_inc(&mptcp_wanted); ++ static_key_enable(&mptcp_static_key); ++#else ++ static_key_slow_inc(&mptcp_static_key); ++#endif ++} ++ ++void mptcp_disable_static_key(void) ++{ ++#ifdef 
CONFIG_JUMP_LABEL ++ int wanted; ++ ++ while (1) { ++ wanted = atomic_read(&mptcp_wanted); ++ if (wanted <= 1) ++ break; ++ if (atomic_cmpxchg(&mptcp_wanted, wanted, wanted - 1) == wanted) ++ return; ++ } ++ atomic_dec(&mptcp_needed_deferred); ++ schedule_work(&mptcp_work); ++#else ++ static_key_slow_dec(&mptcp_static_key); ++#endif ++} ++ ++void mptcp_enable_sock(struct sock *sk) ++{ ++ if (!sock_flag(sk, SOCK_MPTCP)) { ++ sock_set_flag(sk, SOCK_MPTCP); ++ tcp_sk(sk)->mptcp_ver = sysctl_mptcp_version; ++ ++ /* Necessary here, because MPTCP can be enabled/disabled through ++ * a setsockopt. ++ */ ++ if (sk->sk_family == AF_INET) ++ inet_csk(sk)->icsk_af_ops = &mptcp_v4_specific; ++#if IS_ENABLED(CONFIG_IPV6) ++ else if (mptcp_v6_is_v4_mapped(sk)) ++ inet_csk(sk)->icsk_af_ops = &mptcp_v6_mapped; ++ else ++ inet_csk(sk)->icsk_af_ops = &mptcp_v6_specific; ++#endif ++ ++ mptcp_enable_static_key(); ++ } ++} ++ ++void mptcp_disable_sock(struct sock *sk) ++{ ++ if (sock_flag(sk, SOCK_MPTCP)) { ++ sock_reset_flag(sk, SOCK_MPTCP); ++ ++ /* Necessary here, because MPTCP can be enabled/disabled through ++ * a setsockopt. ++ */ ++ if (sk->sk_family == AF_INET) ++ inet_csk(sk)->icsk_af_ops = &ipv4_specific; ++#if IS_ENABLED(CONFIG_IPV6) ++ else if (mptcp_v6_is_v4_mapped(sk)) ++ inet_csk(sk)->icsk_af_ops = &ipv6_mapped; ++ else ++ inet_csk(sk)->icsk_af_ops = &ipv6_specific; ++#endif ++ ++ mptcp_disable_static_key(); ++ } ++} ++ ++void mptcp_connect_init(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ ++ rcu_read_lock(); ++ local_bh_disable(); ++ spin_lock(&mptcp_tk_hashlock); ++ do { ++ mptcp_set_key_sk(sk); ++ } while (mptcp_reqsk_find_tk(tp->mptcp_loc_token) || ++ mptcp_find_token(tp->mptcp_loc_token)); ++ ++ __mptcp_hash_insert(tp, tp->mptcp_loc_token); ++ spin_unlock(&mptcp_tk_hashlock); ++ local_bh_enable(); ++ rcu_read_unlock(); ++ ++ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVE); ++} ++ ++/** ++ * This function increments the refcount of the mpcb struct. ++ * It is the responsibility of the caller to decrement when releasing ++ * the structure. ++ */ ++struct sock *mptcp_hash_find(const struct net *net, const u32 token) ++{ ++ const u32 hash = mptcp_hash_tk(token, &mptcp_tk_htable); ++ const struct tcp_sock *meta_tp; ++ struct sock *meta_sk = NULL; ++ const struct hlist_nulls_node *node; ++ ++ rcu_read_lock(); ++ local_bh_disable(); ++begin: ++ hlist_nulls_for_each_entry_rcu(meta_tp, node, ++ &mptcp_tk_htable.hashtable[hash], ++ tk_table) { ++ meta_sk = (struct sock *)meta_tp; ++ if (token == meta_tp->mptcp_loc_token && ++ net_eq(net, sock_net(meta_sk))) { ++ if (unlikely(!refcount_inc_not_zero(&meta_sk->sk_refcnt))) ++ goto out; ++ if (unlikely(token != meta_tp->mptcp_loc_token || ++ !net_eq(net, sock_net(meta_sk)))) { ++ sock_gen_put(meta_sk); ++ goto begin; ++ } ++ goto found; ++ } ++ } ++ /* A TCP-socket is destroyed by RCU. So, it might have been recycled ++ * and put into another hash-table list. So, after the lookup we may ++ * end up in a different list. So, we may need to restart. ++ * ++ * See also the comment in __inet_lookup_established. 
++ */ ++ if (get_nulls_value(node) != hash) ++ goto begin; ++out: ++ meta_sk = NULL; ++found: ++ local_bh_enable(); ++ rcu_read_unlock(); ++ return meta_sk; ++} ++EXPORT_SYMBOL_GPL(mptcp_hash_find); ++ ++void mptcp_hash_remove_bh(struct tcp_sock *meta_tp) ++{ ++ /* remove from the token hashtable */ ++ rcu_read_lock(); ++ local_bh_disable(); ++ spin_lock(&mptcp_tk_hashlock); ++ hlist_nulls_del_init_rcu(&meta_tp->tk_table); ++ meta_tp->inside_tk_table = 0; ++ spin_unlock(&mptcp_tk_hashlock); ++ local_bh_enable(); ++ rcu_read_unlock(); ++} ++ ++struct sock *mptcp_select_ack_sock(const struct sock *meta_sk) ++{ ++ const struct tcp_sock *meta_tp = tcp_sk(meta_sk); ++ struct sock *rttsk = NULL, *lastsk = NULL; ++ u32 min_time = 0, last_active = 0; ++ struct mptcp_tcp_sock *mptcp; ++ ++ mptcp_for_each_sub(meta_tp->mpcb, mptcp) { ++ struct sock *sk = mptcp_to_sock(mptcp); ++ struct tcp_sock *tp = tcp_sk(sk); ++ u32 elapsed; ++ ++ if (!mptcp_sk_can_send_ack(sk) || tp->pf) ++ continue; ++ ++ elapsed = keepalive_time_elapsed(tp); ++ ++ /* We take the one with the lowest RTT within a reasonable ++ * (meta-RTO)-timeframe ++ */ ++ if (elapsed < inet_csk(meta_sk)->icsk_rto) { ++ if (!min_time || tp->srtt_us < min_time) { ++ min_time = tp->srtt_us; ++ rttsk = sk; ++ } ++ continue; ++ } ++ ++ /* Otherwise, we just take the most recent active */ ++ if (!rttsk && (!last_active || elapsed < last_active)) { ++ last_active = elapsed; ++ lastsk = sk; ++ } ++ } ++ ++ if (rttsk) ++ return rttsk; ++ ++ return lastsk; ++} ++EXPORT_SYMBOL(mptcp_select_ack_sock); ++ ++static void mptcp_sock_def_error_report(struct sock *sk) ++{ ++ const struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb; ++ struct tcp_sock *tp = tcp_sk(sk); ++ ++ if (!sock_flag(sk, SOCK_DEAD)) { ++ if (tp->send_mp_fclose && sk->sk_err == ETIMEDOUT) { ++ /* Called by the keep alive timer (tcp_write_timeout), ++ * when the limit of fastclose retransmissions has been ++ * reached. Send a TCP RST to clear the status of any ++ * stateful firewall (typically conntrack) which are ++ * not aware of mptcp and cannot understand the ++ * fastclose option. ++ */ ++ tp->ops->send_active_reset(sk, GFP_ATOMIC); ++ } ++ } ++ ++ /* record this info that can be used by PM after the sf close */ ++ tp->mptcp->sk_err = sk->sk_err; ++ ++ if (!tp->tcp_disconnect && mptcp_in_infinite_mapping_weak(mpcb)) { ++ struct sock *meta_sk = mptcp_meta_sk(sk); ++ ++ meta_sk->sk_err = sk->sk_err; ++ meta_sk->sk_err_soft = sk->sk_err_soft; ++ ++ if (!sock_flag(meta_sk, SOCK_DEAD)) ++ meta_sk->sk_error_report(meta_sk); ++ ++ WARN(meta_sk->sk_state == TCP_CLOSE, ++ "Meta already closed i_rcv %u i_snd %u send_i %u flags %#lx\n", ++ mpcb->infinite_mapping_rcv, mpcb->infinite_mapping_snd, ++ mpcb->send_infinite_mapping, meta_sk->sk_flags); ++ ++ if (meta_sk->sk_state != TCP_CLOSE) ++ tcp_done(meta_sk); ++ } ++ ++ sk->sk_err = 0; ++ return; ++} ++ ++void mptcp_mpcb_put(struct mptcp_cb *mpcb) ++{ ++ if (refcount_dec_and_test(&mpcb->mpcb_refcnt)) { ++ mptcp_cleanup_path_manager(mpcb); ++ mptcp_cleanup_scheduler(mpcb); ++ kfree(mpcb->master_info); ++ kmem_cache_free(mptcp_cb_cache, mpcb); ++ } ++} ++EXPORT_SYMBOL(mptcp_mpcb_put); ++ ++static void mptcp_mpcb_cleanup(struct mptcp_cb *mpcb) ++{ ++ struct mptcp_tw *mptw; ++ ++ /* The mpcb is disappearing - we can make the final ++ * update to the rcv_nxt of the time-wait-sock and remove ++ * its reference to the mpcb. 
++ */ ++ spin_lock_bh(&mpcb->mpcb_list_lock); ++ list_for_each_entry_rcu(mptw, &mpcb->tw_list, list) { ++ list_del_rcu(&mptw->list); ++ mptw->in_list = 0; ++ mptcp_mpcb_put(mpcb); ++ rcu_assign_pointer(mptw->mpcb, NULL); ++ } ++ spin_unlock_bh(&mpcb->mpcb_list_lock); ++ ++ mptcp_mpcb_put(mpcb); ++} ++ ++static void mptcp_sock_destruct(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ ++ if (!is_meta_sk(sk)) { ++ BUG_ON(!hlist_unhashed(&tp->mptcp->cb_list)); ++ ++ kmem_cache_free(mptcp_sock_cache, tp->mptcp); ++ tp->mptcp = NULL; ++ ++ /* Taken when mpcb pointer was set */ ++ sock_put(mptcp_meta_sk(sk)); ++ mptcp_mpcb_put(tp->mpcb); ++ } else { ++ mptcp_debug("%s destroying meta-sk token %#x\n", __func__, ++ tcp_sk(sk)->mpcb->mptcp_loc_token); ++ ++ mptcp_mpcb_cleanup(tp->mpcb); ++ } ++ ++ WARN_ON(!static_key_false(&mptcp_static_key)); ++ ++ /* Must be called here, because this will decrement the jump-label. */ ++ inet_sock_destruct(sk); ++} ++ ++void mptcp_destroy_sock(struct sock *sk) ++{ ++ if (is_meta_sk(sk)) { ++ struct mptcp_tcp_sock *mptcp; ++ struct hlist_node *tmp; ++ ++ __skb_queue_purge(&tcp_sk(sk)->mpcb->reinject_queue); ++ ++ /* We have to close all remaining subflows. Normally, they ++ * should all be about to get closed. But, if the kernel is ++ * forcing a closure (e.g., tcp_write_err), the subflows might ++ * not have been closed properly (as we are waiting for the ++ * DATA_ACK of the DATA_FIN). ++ */ ++ mptcp_for_each_sub_safe(tcp_sk(sk)->mpcb, mptcp, tmp) { ++ struct sock *sk_it = mptcp_to_sock(mptcp); ++ ++ /* Already did call tcp_close - waiting for graceful ++ * closure, or if we are retransmitting fast-close on ++ * the subflow. The reset (or timeout) will kill the ++ * subflow.. ++ */ ++ if (tcp_sk(sk_it)->closing || ++ tcp_sk(sk_it)->send_mp_fclose) ++ continue; ++ ++ /* Allow the delayed work first to prevent time-wait state */ ++ if (delayed_work_pending(&tcp_sk(sk_it)->mptcp->work)) ++ continue; ++ ++ mptcp_sub_close(sk_it, 0); ++ } ++ } else { ++ mptcp_del_sock(sk); ++ } ++} ++ ++static void mptcp_set_state(struct sock *sk) ++{ ++ struct sock *meta_sk = mptcp_meta_sk(sk); ++ ++ /* Meta is not yet established - wake up the application */ ++ if ((1 << meta_sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV) && ++ sk->sk_state == TCP_ESTABLISHED) { ++ tcp_set_state(meta_sk, TCP_ESTABLISHED); ++ ++ if (!sock_flag(meta_sk, SOCK_DEAD)) { ++ meta_sk->sk_state_change(meta_sk); ++ sk_wake_async(meta_sk, SOCK_WAKE_IO, POLL_OUT); ++ } ++ ++ tcp_sk(meta_sk)->lsndtime = tcp_jiffies32; ++ } ++ ++ if (sk->sk_state == TCP_CLOSE) { ++ if (!sock_flag(sk, SOCK_DEAD)) ++ mptcp_sub_close(sk, 0); ++ } ++} ++ ++static int mptcp_set_congestion_control(struct sock *meta_sk, const char *name, ++ bool load, bool reinit, bool cap_net_admin) ++{ ++ struct mptcp_tcp_sock *mptcp; ++ int err, result = 0; ++ ++ result = __tcp_set_congestion_control(meta_sk, name, load, reinit, cap_net_admin); ++ ++ tcp_sk(meta_sk)->mpcb->tcp_ca_explicit_set = true; ++ ++ mptcp_for_each_sub(tcp_sk(meta_sk)->mpcb, mptcp) { ++ struct sock *sk_it = mptcp_to_sock(mptcp); ++ ++ err = __tcp_set_congestion_control(sk_it, name, load, reinit, cap_net_admin); ++ if (err) ++ result = err; ++ } ++ return result; ++} ++ ++static void mptcp_assign_congestion_control(struct sock *sk) ++{ ++ struct inet_connection_sock *icsk = inet_csk(sk); ++ struct inet_connection_sock *meta_icsk = inet_csk(mptcp_meta_sk(sk)); ++ const struct tcp_congestion_ops *ca = meta_icsk->icsk_ca_ops; ++ ++ /* Congestion control is the same as 
meta. Thus, it has been ++ * try_module_get'd by tcp_assign_congestion_control. ++ * Congestion control on meta was not explicitly configured by ++ * application, leave default or route based. ++ */ ++ if (icsk->icsk_ca_ops == ca || ++ !tcp_sk(mptcp_meta_sk(sk))->mpcb->tcp_ca_explicit_set) ++ return; ++ ++ /* Use the same congestion control as set on the meta-sk */ ++ if (!try_module_get(ca->owner)) { ++ /* This should never happen. The congestion control is linked ++ * to the meta-socket (through tcp_assign_congestion_control) ++ * who "holds" the refcnt on the module. ++ */ ++ WARN(1, "Could not get the congestion control!"); ++ return; ++ } ++ module_put(icsk->icsk_ca_ops->owner); ++ icsk->icsk_ca_ops = ca; ++ ++ /* Clear out private data before diag gets it and ++ * the ca has not been initialized. ++ */ ++ if (ca->get_info) ++ memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); ++ ++ return; ++} ++ ++siphash_key_t mptcp_secret __read_mostly; ++u32 mptcp_seed = 0; ++ ++#define SHA256_DIGEST_WORDS (SHA256_DIGEST_SIZE / 4) ++ ++static void mptcp_key_sha256(const u64 key, u32 *token, u64 *idsn) ++{ ++ u32 mptcp_hashed_key[SHA256_DIGEST_WORDS]; ++ struct sha256_state state; ++ ++ sha256_init(&state); ++ sha256_update(&state, (const u8 *)&key, sizeof(key)); ++ sha256_final(&state, (u8 *)mptcp_hashed_key); ++ ++ if (token) ++ *token = mptcp_hashed_key[0]; ++ if (idsn) ++ *idsn = ntohll(*((__be64 *)&mptcp_hashed_key[6])); ++} ++ ++static void mptcp_hmac_sha256(const u8 *key_1, const u8 *key_2, u8 *hash_out, ++ int arg_num, va_list list) ++{ ++ u8 input[SHA256_BLOCK_SIZE + SHA256_DIGEST_SIZE]; ++ struct sha256_state state; ++ int index, msg_length; ++ int length = 0; ++ u8 *msg; ++ int i; ++ ++ /* Generate key xored with ipad */ ++ memset(input, 0x36, SHA256_BLOCK_SIZE); ++ for (i = 0; i < 8; i++) ++ input[i] ^= key_1[i]; ++ for (i = 0; i < 8; i++) ++ input[i + 8] ^= key_2[i]; ++ ++ index = SHA256_BLOCK_SIZE; ++ msg_length = 0; ++ for (i = 0; i < arg_num; i++) { ++ length = va_arg(list, int); ++ msg = va_arg(list, u8 *); ++ BUG_ON(index + length >= sizeof(input)); /* Message is too long */ ++ memcpy(&input[index], msg, length); ++ index += length; ++ msg_length += length; ++ } ++ ++ sha256_init(&state); ++ sha256_update(&state, input, SHA256_BLOCK_SIZE + msg_length); ++ sha256_final(&state, &input[SHA256_BLOCK_SIZE]); ++ ++ /* Prepare second part of hmac */ ++ memset(input, 0x5C, SHA256_BLOCK_SIZE); ++ for (i = 0; i < 8; i++) ++ input[i] ^= key_1[i]; ++ for (i = 0; i < 8; i++) ++ input[i + 8] ^= key_2[i]; ++ ++ sha256_init(&state); ++ sha256_update(&state, input, sizeof(input)); ++ sha256_final(&state, hash_out); ++} ++ ++static void mptcp_key_sha1(u64 key, u32 *token, u64 *idsn) ++{ ++ u32 workspace[SHA_WORKSPACE_WORDS]; ++ u32 mptcp_hashed_key[SHA_DIGEST_WORDS]; ++ u8 input[64]; ++ int i; ++ ++ memset(workspace, 0, sizeof(workspace)); ++ ++ /* Initialize input with appropriate padding */ ++ memset(&input[9], 0, sizeof(input) - 10); /* -10, because the last byte ++ * is explicitly set too ++ */ ++ memcpy(input, &key, sizeof(key)); /* Copy key to the msg beginning */ ++ input[8] = 0x80; /* Padding: First bit after message = 1 */ ++ input[63] = 0x40; /* Padding: Length of the message = 64 bits */ ++ ++ sha_init(mptcp_hashed_key); ++ sha_transform(mptcp_hashed_key, input, workspace); ++ ++ for (i = 0; i < 5; i++) ++ mptcp_hashed_key[i] = (__force u32)cpu_to_be32(mptcp_hashed_key[i]); ++ ++ if (token) ++ *token = mptcp_hashed_key[0]; ++ if (idsn) ++ *idsn = ntohll(*((__be64 
*)&mptcp_hashed_key[3])); ++} ++ ++static void mptcp_key_hash(u8 version, u64 key, u32 *token, u64 *idsn) ++{ ++ if (version == MPTCP_VERSION_0) ++ mptcp_key_sha1(key, token, idsn); ++ else if (version >= MPTCP_VERSION_1) ++ mptcp_key_sha256(key, token, idsn); ++} ++ ++static void mptcp_hmac_sha1(const u8 *key_1, const u8 *key_2, u32 *hash_out, ++ int arg_num, va_list list) ++{ ++ u32 workspace[SHA_WORKSPACE_WORDS]; ++ u8 input[128]; /* 2 512-bit blocks */ ++ int i; ++ int index; ++ int length; ++ u8 *msg; ++ ++ memset(workspace, 0, sizeof(workspace)); ++ ++ /* Generate key xored with ipad */ ++ memset(input, 0x36, 64); ++ for (i = 0; i < 8; i++) ++ input[i] ^= key_1[i]; ++ for (i = 0; i < 8; i++) ++ input[i + 8] ^= key_2[i]; ++ ++ index = 64; ++ for (i = 0; i < arg_num; i++) { ++ length = va_arg(list, int); ++ msg = va_arg(list, u8 *); ++ BUG_ON(index + length > 125); /* Message is too long */ ++ memcpy(&input[index], msg, length); ++ index += length; ++ } ++ ++ input[index] = 0x80; /* Padding: First bit after message = 1 */ ++ memset(&input[index + 1], 0, (126 - index)); ++ ++ /* Padding: Length of the message = 512 + message length (bits) */ ++ input[126] = 0x02; ++ input[127] = ((index - 64) * 8); /* Message length (bits) */ ++ ++ sha_init(hash_out); ++ sha_transform(hash_out, input, workspace); ++ memset(workspace, 0, sizeof(workspace)); ++ ++ sha_transform(hash_out, &input[64], workspace); ++ memset(workspace, 0, sizeof(workspace)); ++ ++ for (i = 0; i < 5; i++) ++ hash_out[i] = (__force u32)cpu_to_be32(hash_out[i]); ++ ++ /* Prepare second part of hmac */ ++ memset(input, 0x5C, 64); ++ for (i = 0; i < 8; i++) ++ input[i] ^= key_1[i]; ++ for (i = 0; i < 8; i++) ++ input[i + 8] ^= key_2[i]; ++ ++ memcpy(&input[64], hash_out, 20); ++ input[84] = 0x80; ++ memset(&input[85], 0, 41); ++ ++ /* Padding: Length of the message = 512 + 160 bits */ ++ input[126] = 0x02; ++ input[127] = 0xA0; ++ ++ sha_init(hash_out); ++ sha_transform(hash_out, input, workspace); ++ memset(workspace, 0, sizeof(workspace)); ++ ++ sha_transform(hash_out, &input[64], workspace); ++ ++ for (i = 0; i < 5; i++) ++ hash_out[i] = (__force u32)cpu_to_be32(hash_out[i]); ++} ++ ++void mptcp_hmac(u8 ver, const u8 *key_1, const u8 *key_2, u8 *hash_out, ++ int arg_num, ...) ++{ ++ va_list args; ++ ++ va_start(args, arg_num); ++ if (ver == MPTCP_VERSION_0) ++ mptcp_hmac_sha1(key_1, key_2, (u32 *)hash_out, arg_num, args); ++ else if (ver >= MPTCP_VERSION_1) ++ mptcp_hmac_sha256(key_1, key_2, hash_out, arg_num, args); ++ va_end(args); ++} ++EXPORT_SYMBOL(mptcp_hmac); ++ ++static void mptcp_mpcb_inherit_sockopts(struct sock *meta_sk, struct sock *master_sk) ++{ ++ /* Socket-options handled by sk_clone_lock while creating the meta-sk. ++ * ====== ++ * SO_SNDBUF, SO_SNDBUFFORCE, SO_RCVBUF, SO_RCVBUFFORCE, SO_RCVLOWAT, ++ * SO_RCVTIMEO, SO_SNDTIMEO, SO_ATTACH_FILTER, SO_DETACH_FILTER, ++ * TCP_NODELAY, TCP_CORK ++ * ++ * Socket-options handled in this function here ++ * ====== ++ * TCP_DEFER_ACCEPT ++ * SO_KEEPALIVE ++ * ++ * Socket-options on the todo-list ++ * ====== ++ * SO_BINDTODEVICE - should probably prevent creation of new subsocks ++ * across other devices. - what about the api-draft? 
++ * SO_DEBUG ++ * SO_REUSEADDR - probably we don't care about this ++ * SO_DONTROUTE, SO_BROADCAST ++ * SO_OOBINLINE ++ * SO_LINGER ++ * SO_TIMESTAMP* - I don't think this is of concern for a SOCK_STREAM ++ * SO_PASSSEC - I don't think this is of concern for a SOCK_STREAM ++ * SO_RXQ_OVFL ++ * TCP_COOKIE_TRANSACTIONS ++ * TCP_MAXSEG ++ * TCP_THIN_* - Handled by sk_clone_lock, but we need to support this ++ * in mptcp_meta_retransmit_timer. AND we need to check ++ * what happens with the subsockets. ++ * TCP_LINGER2 ++ * TCP_WINDOW_CLAMP ++ * TCP_USER_TIMEOUT ++ * TCP_MD5SIG ++ * ++ * Socket-options of no concern for the meta-socket (but for the subsocket) ++ * ====== ++ * SO_PRIORITY ++ * SO_MARK ++ * TCP_CONGESTION ++ * TCP_SYNCNT ++ * TCP_QUICKACK ++ */ ++ ++ /* DEFER_ACCEPT should not be set on the meta, as we want to accept new subflows directly */ ++ inet_csk(meta_sk)->icsk_accept_queue.rskq_defer_accept = 0; ++ ++ /* Keepalives are handled entirely at the MPTCP-layer */ ++ if (sock_flag(meta_sk, SOCK_KEEPOPEN)) { ++ inet_csk_reset_keepalive_timer(meta_sk, ++ keepalive_time_when(tcp_sk(meta_sk))); ++ sock_reset_flag(master_sk, SOCK_KEEPOPEN); ++ inet_csk_delete_keepalive_timer(master_sk); ++ } ++ ++ /* Do not propagate subflow-errors up to the MPTCP-layer */ ++ inet_sk(master_sk)->recverr = 0; ++} ++ ++/* Called without holding lock on meta_sk */ ++static void mptcp_sub_inherit_sockopts(const struct sock *meta_sk, struct sock *sub_sk) ++{ ++ __u8 meta_tos; ++ ++ /* IP_TOS also goes to the subflow. */ ++ meta_tos = READ_ONCE(inet_sk(meta_sk)->tos); ++ if (inet_sk(sub_sk)->tos != meta_tos) { ++ inet_sk(sub_sk)->tos = meta_tos; ++ sub_sk->sk_priority = meta_sk->sk_priority; ++ sk_dst_reset(sub_sk); ++ } ++ ++ /* IPV6_TCLASS */ ++ if (sub_sk->sk_family == AF_INET6 && meta_sk->sk_family == AF_INET6) ++ inet6_sk(sub_sk)->tclass = inet6_sk(meta_sk)->tclass; ++ ++ /* Inherit SO_REUSEADDR */ ++ sub_sk->sk_reuse = meta_sk->sk_reuse; ++ ++ /* Inherit SO_MARK: can be used for routing or filtering */ ++ sub_sk->sk_mark = meta_sk->sk_mark; ++ ++ /* Inherit snd/rcv-buffer locks */ ++ sub_sk->sk_userlocks = meta_sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; ++ ++ /* Nagle/Cork is forced off on the subflows. It is handled at the meta-layer */ ++ tcp_sk(sub_sk)->nonagle = TCP_NAGLE_OFF|TCP_NAGLE_PUSH; ++ ++ /* Keepalives are handled entirely at the MPTCP-layer */ ++ if (sock_flag(sub_sk, SOCK_KEEPOPEN)) { ++ sock_reset_flag(sub_sk, SOCK_KEEPOPEN); ++ inet_csk_delete_keepalive_timer(sub_sk); ++ } ++ ++ /* Do not propagate subflow-errors up to the MPTCP-layer */ ++ inet_sk(sub_sk)->recverr = 0; ++} ++ ++void mptcp_prepare_for_backlog(struct sock *sk, struct sk_buff *skb) ++{ ++ /* In case of success (in mptcp_backlog_rcv) and error (in kfree_skb) of ++ * sk_add_backlog, we will decrement the sk refcount. ++ */ ++ sock_hold(sk); ++ skb->sk = sk; ++ skb->destructor = sock_efree; ++} ++ ++int mptcp_backlog_rcv(struct sock *meta_sk, struct sk_buff *skb) ++{ ++ /* skb->sk may be NULL if we receive a packet immediately after the ++ * SYN/ACK + MP_CAPABLE. ++ */ ++ struct sock *sk = skb->sk ? skb->sk : meta_sk; ++ int ret = 0; ++ ++ if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) { ++ kfree_skb(skb); ++ return 0; ++ } ++ ++ /* Decrement sk refcnt when calling the skb destructor. ++ * Refcnt is incremented and skb destructor is set in tcp_v{4,6}_rcv via ++ * mptcp_prepare_for_backlog() here above.
++ */ ++ skb_orphan(skb); ++ ++ if (sk->sk_family == AF_INET) ++ ret = tcp_v4_do_rcv(sk, skb); ++#if IS_ENABLED(CONFIG_IPV6) ++ else ++ ret = tcp_v6_do_rcv(sk, skb); ++#endif ++ ++ sock_put(sk); ++ return ret; ++} ++ ++static void mptcp_init_buffer_space(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct sock *meta_sk = mptcp_meta_sk(sk); ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk); ++ int space; ++ ++ tcp_init_buffer_space(sk); ++ ++ if (is_master_tp(tp)) { ++ meta_tp->rcvq_space.space = meta_tp->rcv_wnd; ++ tcp_mstamp_refresh(meta_tp); ++ meta_tp->rcvq_space.time = meta_tp->tcp_mstamp; ++ meta_tp->rcvq_space.seq = meta_tp->copied_seq; ++ ++ /* If there is only one subflow, we just use regular TCP ++ * autotuning. User-locks are handled already by ++ * tcp_init_buffer_space ++ */ ++ meta_tp->window_clamp = tp->window_clamp; ++ meta_tp->rcv_ssthresh = tp->rcv_ssthresh; ++ meta_sk->sk_rcvbuf = sk->sk_rcvbuf; ++ meta_sk->sk_sndbuf = sk->sk_sndbuf; ++ ++ return; ++ } ++ ++ if (meta_sk->sk_userlocks & SOCK_RCVBUF_LOCK) ++ goto snd_buf; ++ ++ /* Adding a new subflow to the rcv-buffer space. We make a simple ++ * addition, to give some space to allow traffic on the new subflow. ++ * Autotuning will increase it further later on. ++ */ ++ space = min(meta_sk->sk_rcvbuf + sk->sk_rcvbuf, ++ sock_net(meta_sk)->ipv4.sysctl_tcp_rmem[2]); ++ if (space > meta_sk->sk_rcvbuf) { ++ meta_tp->window_clamp += tp->window_clamp; ++ meta_tp->rcv_ssthresh += tp->rcv_ssthresh; ++ meta_sk->sk_rcvbuf = space; ++ } ++ ++snd_buf: ++ if (meta_sk->sk_userlocks & SOCK_SNDBUF_LOCK) ++ return; ++ ++ /* Adding a new subflow to the send-buffer space. We make a simple ++ * addition, to give some space to allow traffic on the new subflow. ++ * Autotuning will increase it further later on. 
++ */ ++ space = min(meta_sk->sk_sndbuf + sk->sk_sndbuf, ++ sock_net(meta_sk)->ipv4.sysctl_tcp_wmem[2]); ++ if (space > meta_sk->sk_sndbuf) { ++ meta_sk->sk_sndbuf = space; ++ meta_sk->sk_write_space(meta_sk); ++ } ++} ++ ++struct lock_class_key meta_key; ++char *meta_key_name = "sk_lock-AF_INET-MPTCP"; ++struct lock_class_key meta_slock_key; ++char *meta_slock_key_name = "slock-AF_INET-MPTCP"; ++ ++static const struct tcp_sock_ops mptcp_meta_specific = { ++ .__select_window = __mptcp_select_window, ++ .select_window = mptcp_select_window, ++ .select_initial_window = mptcp_select_initial_window, ++ .init_buffer_space = mptcp_init_buffer_space, ++ .set_rto = mptcp_tcp_set_rto, ++ .should_expand_sndbuf = mptcp_should_expand_sndbuf, ++ .send_fin = mptcp_send_fin, ++ .write_xmit = mptcp_write_xmit, ++ .send_active_reset = mptcp_send_active_reset, ++ .write_wakeup = mptcp_write_wakeup, ++ .retransmit_timer = mptcp_meta_retransmit_timer, ++ .time_wait = mptcp_time_wait, ++ .cleanup_rbuf = mptcp_cleanup_rbuf, ++ .set_cong_ctrl = mptcp_set_congestion_control, ++}; ++ ++static const struct tcp_sock_ops mptcp_sub_specific = { ++ .__select_window = __mptcp_select_window, ++ .select_window = mptcp_select_window, ++ .select_initial_window = mptcp_select_initial_window, ++ .init_buffer_space = mptcp_init_buffer_space, ++ .set_rto = mptcp_tcp_set_rto, ++ .should_expand_sndbuf = mptcp_should_expand_sndbuf, ++ .send_fin = tcp_send_fin, ++ .write_xmit = tcp_write_xmit, ++ .send_active_reset = tcp_send_active_reset, ++ .write_wakeup = tcp_write_wakeup, ++ .retransmit_timer = mptcp_sub_retransmit_timer, ++ .time_wait = tcp_time_wait, ++ .cleanup_rbuf = tcp_cleanup_rbuf, ++ .set_cong_ctrl = __tcp_set_congestion_control, ++}; ++ ++void mptcp_initialize_recv_vars(struct tcp_sock *meta_tp, struct mptcp_cb *mpcb, ++ __u64 remote_key) ++{ ++ u64 idsn; ++ ++ mpcb->mptcp_rem_key = remote_key; ++ mpcb->rem_key_set = 1; ++ mptcp_key_hash(mpcb->mptcp_ver, mpcb->mptcp_rem_key, &mpcb->mptcp_rem_token, &idsn); ++ ++ idsn++; ++ mpcb->rcv_high_order[0] = idsn >> 32; ++ mpcb->rcv_high_order[1] = mpcb->rcv_high_order[0] + 1; ++ meta_tp->copied_seq = (u32)idsn; ++ meta_tp->rcv_nxt = (u32)idsn; ++ meta_tp->rcv_wup = (u32)idsn; ++ meta_tp->rcv_right_edge = meta_tp->rcv_wup + meta_tp->rcv_wnd; ++ ++ meta_tp->snd_wl1 = meta_tp->rcv_nxt - 1; ++} ++ ++static int mptcp_alloc_mpcb(struct sock *meta_sk, __u64 remote_key, ++ int rem_key_set, __u8 mptcp_ver, u32 window) ++{ ++ struct mptcp_cb *mpcb; ++ struct sock *master_sk; ++ struct inet_connection_sock *meta_icsk = inet_csk(meta_sk); ++ struct tcp_sock *master_tp, *meta_tp = tcp_sk(meta_sk); ++ u64 snd_idsn; ++ ++ dst_release(meta_sk->sk_rx_dst); ++ meta_sk->sk_rx_dst = NULL; ++ /* This flag is set to announce sock_lock_init to ++ * reclassify the lock-class of the master socket. 
++ */ ++ meta_tp->is_master_sk = 1; ++ master_sk = sk_clone_lock(meta_sk, GFP_ATOMIC | __GFP_ZERO); ++ meta_tp->is_master_sk = 0; ++ if (!master_sk) ++ goto err_alloc_master; ++ ++ /* Same as in inet_csk_clone_lock - need to init to 0 */ ++ memset(&inet_csk(master_sk)->icsk_accept_queue, 0, ++ sizeof(inet_csk(master_sk)->icsk_accept_queue)); ++ ++ master_tp = tcp_sk(master_sk); ++ master_tp->inside_tk_table = 0; ++ ++ mpcb = kmem_cache_zalloc(mptcp_cb_cache, GFP_ATOMIC); ++ if (!mpcb) ++ goto err_alloc_mpcb; ++ ++ /* Store the MPTCP version agreed upon during the initial handshake */ ++ mpcb->mptcp_ver = mptcp_ver; ++ ++ /* Store the keys and generate the peer's token */ ++ mpcb->mptcp_loc_key = meta_tp->mptcp_loc_key; ++ mpcb->mptcp_loc_token = meta_tp->mptcp_loc_token; ++ ++ /* Generate initial data-sequence-numbers */ ++ mptcp_key_hash(mpcb->mptcp_ver, mpcb->mptcp_loc_key, NULL, &snd_idsn); ++ snd_idsn++; ++ mpcb->snd_high_order[0] = snd_idsn >> 32; ++ mpcb->snd_high_order[1] = mpcb->snd_high_order[0] - 1; ++ ++ mpcb->meta_sk = meta_sk; ++ mpcb->master_sk = master_sk; ++ ++ skb_queue_head_init(&mpcb->reinject_queue); ++ mutex_init(&mpcb->mpcb_mutex); ++ ++ /* Init time-wait stuff */ ++ INIT_LIST_HEAD(&mpcb->tw_list); ++ ++ INIT_HLIST_HEAD(&mpcb->callback_list); ++ INIT_HLIST_HEAD(&mpcb->conn_list); ++ spin_lock_init(&mpcb->mpcb_list_lock); ++ ++ mpcb->orig_sk_rcvbuf = meta_sk->sk_rcvbuf; ++ mpcb->orig_sk_sndbuf = meta_sk->sk_sndbuf; ++ mpcb->orig_window_clamp = meta_tp->window_clamp; ++ ++ /* The meta is directly linked - set refcnt to 1 */ ++ refcount_set(&mpcb->mpcb_refcnt, 1); ++ ++ if (!meta_tp->inside_tk_table) { ++ /* Adding the meta_tp in the token hashtable - coming from server-side */ ++ rcu_read_lock(); ++ local_bh_disable(); ++ spin_lock(&mptcp_tk_hashlock); ++ ++ /* With lockless listeners, we might process two ACKs at the ++ * same time. With TCP, inet_csk_complete_hashdance takes care ++ * of this. But, for MPTCP this would be too late if we add ++ * this MPTCP-socket in the token table (new subflows might ++ * come in and match on this socket here). ++ * So, we need to check if someone else already added the token ++ * and revert in that case. The other guy won the race...
++ */ ++ if (mptcp_find_token(mpcb->mptcp_loc_token)) { ++ spin_unlock(&mptcp_tk_hashlock); ++ local_bh_enable(); ++ rcu_read_unlock(); ++ ++ goto err_insert_token; ++ } ++ __mptcp_hash_insert(meta_tp, mpcb->mptcp_loc_token); ++ ++ spin_unlock(&mptcp_tk_hashlock); ++ local_bh_enable(); ++ rcu_read_unlock(); ++ } ++ ++#if IS_ENABLED(CONFIG_IPV6) ++ if (meta_icsk->icsk_af_ops == &mptcp_v6_mapped) { ++ struct tcp6_sock *master_tp6 = (struct tcp6_sock *)master_sk; ++ struct ipv6_pinfo *newnp, *np = inet6_sk(meta_sk); ++ ++ inet_sk(master_sk)->pinet6 = &master_tp6->inet6; ++ ++ newnp = inet6_sk(master_sk); ++ memcpy(newnp, np, sizeof(struct ipv6_pinfo)); ++ ++ newnp->ipv6_mc_list = NULL; ++ newnp->ipv6_ac_list = NULL; ++ newnp->ipv6_fl_list = NULL; ++ newnp->pktoptions = NULL; ++ newnp->opt = NULL; ++ ++ newnp->rxopt.all = 0; ++ newnp->repflow = 0; ++ np->rxopt.all = 0; ++ np->repflow = 0; ++ } else if (meta_sk->sk_family == AF_INET6) { ++ struct tcp6_sock *master_tp6 = (struct tcp6_sock *)master_sk; ++ struct ipv6_pinfo *newnp, *np = inet6_sk(meta_sk); ++ struct ipv6_txoptions *opt; ++ ++ inet_sk(master_sk)->pinet6 = &master_tp6->inet6; ++ ++ /* The following heavily inspired from tcp_v6_syn_recv_sock() */ ++ newnp = inet6_sk(master_sk); ++ memcpy(newnp, np, sizeof(struct ipv6_pinfo)); ++ ++ newnp->ipv6_mc_list = NULL; ++ newnp->ipv6_ac_list = NULL; ++ newnp->ipv6_fl_list = NULL; ++ newnp->pktoptions = NULL; ++ newnp->opt = NULL; ++ ++ newnp->rxopt.all = 0; ++ newnp->repflow = 0; ++ np->rxopt.all = 0; ++ np->repflow = 0; ++ ++ opt = rcu_dereference(np->opt); ++ if (opt) { ++ opt = ipv6_dup_options(master_sk, opt); ++ RCU_INIT_POINTER(newnp->opt, opt); ++ } ++ inet_csk(master_sk)->icsk_ext_hdr_len = 0; ++ if (opt) ++ inet_csk(master_sk)->icsk_ext_hdr_len = opt->opt_nflen + ++ opt->opt_flen; ++ } ++#endif ++ ++ meta_tp->mptcp = NULL; ++ ++ meta_tp->write_seq = (u32)snd_idsn; ++ meta_tp->snd_sml = meta_tp->write_seq; ++ meta_tp->snd_una = meta_tp->write_seq; ++ meta_tp->snd_nxt = meta_tp->write_seq; ++ meta_tp->pushed_seq = meta_tp->write_seq; ++ meta_tp->snd_up = meta_tp->write_seq; ++ ++ if (rem_key_set) ++ mptcp_initialize_recv_vars(meta_tp, mpcb, remote_key); ++ ++ meta_tp->snd_wnd = window; ++ meta_tp->retrans_stamp = 0; /* Set in tcp_connect() */ ++ ++ meta_tp->packets_out = 0; ++ meta_icsk->icsk_probes_out = 0; ++ ++ rcu_assign_pointer(inet_sk(meta_sk)->inet_opt, NULL); ++ ++ /* Set mptcp-pointers */ ++ master_tp->mpcb = mpcb; ++ master_tp->meta_sk = meta_sk; ++ meta_tp->mpcb = mpcb; ++ meta_tp->meta_sk = meta_sk; ++ ++ /* Initialize the queues */ ++ master_tp->out_of_order_queue = RB_ROOT; ++ master_sk->tcp_rtx_queue = RB_ROOT; ++ INIT_LIST_HEAD(&master_tp->tsq_node); ++ INIT_LIST_HEAD(&master_tp->tsorted_sent_queue); ++ ++ master_tp->fastopen_req = NULL; ++ ++ master_sk->sk_tsq_flags = 0; ++ /* icsk_bind_hash inherited from the meta, but it will be properly set in ++ * mptcp_create_master_sk. Same operation is done in inet_csk_clone_lock. ++ */ ++ inet_csk(master_sk)->icsk_bind_hash = NULL; ++ ++ /* Init the accept_queue structure, we support a queue of 32 pending ++ * connections, it does not need to be huge, since we only store here ++ * pending subflow creations. 
++ */ ++ reqsk_queue_alloc(&meta_icsk->icsk_accept_queue); ++ meta_sk->sk_max_ack_backlog = 32; ++ meta_sk->sk_ack_backlog = 0; ++ ++ if (!sock_flag(meta_sk, SOCK_MPTCP)) { ++ mptcp_enable_static_key_bh(); ++ sock_set_flag(meta_sk, SOCK_MPTCP); ++ } ++ ++ /* Redefine function-pointers as the meta-sk is now fully ready */ ++ meta_tp->mpc = 1; ++ meta_tp->ops = &mptcp_meta_specific; ++ ++ meta_sk->sk_backlog_rcv = mptcp_backlog_rcv; ++ meta_sk->sk_destruct = mptcp_sock_destruct; ++ ++ /* Meta-level retransmit timer */ ++ meta_icsk->icsk_rto *= 2; /* Double of initial - rto */ ++ ++ tcp_init_xmit_timers(master_sk); ++ /* Has been set for sending out the SYN */ ++ inet_csk_clear_xmit_timer(meta_sk, ICSK_TIME_RETRANS); ++ ++ mptcp_mpcb_inherit_sockopts(meta_sk, master_sk); ++ ++ mptcp_init_path_manager(mpcb); ++ mptcp_init_scheduler(mpcb); ++ ++ if (!try_module_get(inet_csk(master_sk)->icsk_ca_ops->owner)) ++ tcp_assign_congestion_control(master_sk); ++ ++ master_tp->saved_syn = NULL; ++ ++ mptcp_debug("%s: created mpcb with token %#x\n", ++ __func__, mpcb->mptcp_loc_token); ++ ++ return 0; ++ ++err_insert_token: ++ kmem_cache_free(mptcp_cb_cache, mpcb); ++ ++err_alloc_mpcb: ++ inet_sk(master_sk)->inet_opt = NULL; ++ master_sk->sk_state = TCP_CLOSE; ++ sock_orphan(master_sk); ++ bh_unlock_sock(master_sk); ++ sk_free(master_sk); ++ ++err_alloc_master: ++ return -ENOBUFS; ++} ++ ++/* Called without holding lock on mpcb */ ++static u8 mptcp_set_new_pathindex(struct mptcp_cb *mpcb) ++{ ++ int i; ++ ++ /* Start at 1, because 0 is reserved for the meta-sk */ ++ for (i = 1; i < sizeof(mpcb->path_index_bits) * 8; i++) { ++ if (!test_and_set_bit(i, &mpcb->path_index_bits)) ++ break; ++ } ++ ++ if (i == sizeof(mpcb->path_index_bits) * 8) ++ return 0; ++ return i; ++} ++ ++/* May be called without holding the meta-level lock */ ++int mptcp_add_sock(struct sock *meta_sk, struct sock *sk, u8 loc_id, u8 rem_id, ++ gfp_t flags) ++{ ++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; ++ struct tcp_sock *tp = tcp_sk(sk); ++ ++ tp->mptcp = kmem_cache_zalloc(mptcp_sock_cache, flags); ++ if (!tp->mptcp) ++ return -ENOMEM; ++ ++ tp->mptcp->path_index = mptcp_set_new_pathindex(mpcb); ++ /* No more space for more subflows? */ ++ if (!tp->mptcp->path_index) { ++ kmem_cache_free(mptcp_sock_cache, tp->mptcp); ++ return -EPERM; ++ } ++ ++ INIT_HLIST_NODE(&tp->mptcp->cb_list); ++ ++ tp->mptcp->tp = tp; ++ tp->mpcb = mpcb; ++ tp->meta_sk = meta_sk; ++ ++ if (!sock_flag(sk, SOCK_MPTCP)) { ++ mptcp_enable_static_key_bh(); ++ sock_set_flag(sk, SOCK_MPTCP); ++ } ++ ++ tp->mpc = 1; ++ tp->ops = &mptcp_sub_specific; ++ ++ tp->mptcp->loc_id = loc_id; ++ tp->mptcp->rem_id = rem_id; ++ if (mpcb->sched_ops->init) ++ mpcb->sched_ops->init(sk); ++ ++ /* The corresponding sock_put is in mptcp_sock_destruct(). It cannot be ++ * included in mptcp_del_sock(), because the mpcb must remain alive ++ * until the last subsocket is completely destroyed. 
++ */ ++ sock_hold(meta_sk); ++ refcount_inc(&mpcb->mpcb_refcnt); ++ ++ spin_lock_bh(&mpcb->mpcb_list_lock); ++ hlist_add_head_rcu(&tp->mptcp->node, &mpcb->conn_list); ++ spin_unlock_bh(&mpcb->mpcb_list_lock); ++ ++ tp->mptcp->attached = 1; ++ ++ mptcp_sub_inherit_sockopts(meta_sk, sk); ++ INIT_DELAYED_WORK(&tp->mptcp->work, mptcp_sub_close_wq); ++ ++ /* Properly inherit CC from the meta-socket */ ++ mptcp_assign_congestion_control(sk); ++ ++ /* As we successfully allocated the mptcp_tcp_sock, we have to ++ * change the function-pointers here (for sk_destruct to work correctly) ++ */ ++ sk->sk_error_report = mptcp_sock_def_error_report; ++ sk->sk_data_ready = mptcp_data_ready; ++ sk->sk_write_space = mptcp_write_space; ++ sk->sk_state_change = mptcp_set_state; ++ sk->sk_destruct = mptcp_sock_destruct; ++ ++ if (sk->sk_family == AF_INET) ++ mptcp_debug("%s: token %#x pi %d, src_addr:%pI4:%d dst_addr:%pI4:%d\n", ++ __func__ , mpcb->mptcp_loc_token, ++ tp->mptcp->path_index, ++ &((struct inet_sock *)tp)->inet_saddr, ++ ntohs(((struct inet_sock *)tp)->inet_sport), ++ &((struct inet_sock *)tp)->inet_daddr, ++ ntohs(((struct inet_sock *)tp)->inet_dport)); ++#if IS_ENABLED(CONFIG_IPV6) ++ else ++ mptcp_debug("%s: token %#x pi %d, src_addr:%pI6:%d dst_addr:%pI6:%d\n", ++ __func__ , mpcb->mptcp_loc_token, ++ tp->mptcp->path_index, &inet6_sk(sk)->saddr, ++ ntohs(((struct inet_sock *)tp)->inet_sport), ++ &sk->sk_v6_daddr, ++ ntohs(((struct inet_sock *)tp)->inet_dport)); ++#endif ++ ++ return 0; ++} ++ ++void mptcp_del_sock(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct mptcp_cb *mpcb; ++ ++ if (!tp->mptcp || !tp->mptcp->attached) ++ return; ++ ++ mpcb = tp->mpcb; ++ ++ if (mpcb->sched_ops->release) ++ mpcb->sched_ops->release(sk); ++ ++ if (mpcb->pm_ops->delete_subflow) ++ mpcb->pm_ops->delete_subflow(sk); ++ ++ mptcp_debug("%s: Removing subsock tok %#x pi:%d state %d is_meta? %d\n", ++ __func__, mpcb->mptcp_loc_token, tp->mptcp->path_index, ++ sk->sk_state, is_meta_sk(sk)); ++ ++ spin_lock_bh(&mpcb->mpcb_list_lock); ++ hlist_del_init_rcu(&tp->mptcp->node); ++ spin_unlock_bh(&mpcb->mpcb_list_lock); ++ ++ tp->mptcp->attached = 0; ++ mpcb->path_index_bits &= ~(1 << tp->mptcp->path_index); ++ ++ if (!tcp_write_queue_empty(sk) || !tcp_rtx_queue_empty(sk)) ++ mptcp_reinject_data(sk, 0); ++ ++ if (is_master_tp(tp)) { ++ struct sock *meta_sk = mptcp_meta_sk(sk); ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk); ++ ++ if (meta_tp->record_master_info && ++ !sock_flag(meta_sk, SOCK_DEAD)) { ++ mpcb->master_info = kmalloc(sizeof(*mpcb->master_info), ++ GFP_ATOMIC); ++ ++ if (mpcb->master_info) ++ tcp_get_info(sk, mpcb->master_info, true); ++ } ++ ++ mpcb->master_sk = NULL; ++ } else if (tp->mptcp->pre_established) { ++ sk_stop_timer(sk, &tp->mptcp->mptcp_ack_timer); ++ } ++} ++ ++/* Updates the MPTCP-session based on path-manager information (e.g., addresses, ++ * low-prio flows,...). ++ */ ++void mptcp_update_metasocket(const struct sock *meta_sk) ++{ ++ if (tcp_sk(meta_sk)->mpcb->pm_ops->new_session) ++ tcp_sk(meta_sk)->mpcb->pm_ops->new_session(meta_sk); ++} ++ ++/* Clean up the receive buffer for full frames taken by the user, ++ * then send an ACK if necessary. COPIED is the number of bytes ++ * tcp_recvmsg has given to the user so far, it speeds up the ++ * calculation of whether or not we must ACK for the sake of ++ * a window update. 
++ * (inspired from tcp_cleanup_rbuf()) ++ */ ++void mptcp_cleanup_rbuf(struct sock *meta_sk, int copied) ++{ ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk); ++ bool recheck_rcv_window = false; ++ struct mptcp_tcp_sock *mptcp; ++ __u32 rcv_window_now = 0; ++ ++ if (copied > 0 && !(meta_sk->sk_shutdown & RCV_SHUTDOWN)) { ++ rcv_window_now = tcp_receive_window_now(meta_tp); ++ ++ /* Optimize, __mptcp_select_window() is not cheap. */ ++ if (2 * rcv_window_now <= meta_tp->window_clamp) ++ recheck_rcv_window = true; ++ } ++ ++ mptcp_for_each_sub(meta_tp->mpcb, mptcp) { ++ struct sock *sk = mptcp_to_sock(mptcp); ++ struct tcp_sock *tp = tcp_sk(sk); ++ const struct inet_connection_sock *icsk = inet_csk(sk); ++ ++ if (!mptcp_sk_can_send_ack(sk)) ++ continue; ++ ++ if (!inet_csk_ack_scheduled(sk)) ++ goto second_part; ++ /* Delayed ACKs frequently hit locked sockets during bulk ++ * receive. ++ */ ++ if (icsk->icsk_ack.blocked || ++ /* Once-per-two-segments ACK was not sent by tcp_input.c */ ++ tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss || ++ /* If this read emptied read buffer, we send ACK, if ++ * connection is not bidirectional, user drained ++ * receive buffer and there was a small segment ++ * in queue. ++ */ ++ (copied > 0 && ++ ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) || ++ ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) && ++ !icsk->icsk_ack.pingpong)) && ++ !atomic_read(&meta_sk->sk_rmem_alloc))) { ++ tcp_send_ack(sk); ++ continue; ++ } ++ ++second_part: ++ /* This here is the second part of tcp_cleanup_rbuf */ ++ if (recheck_rcv_window) { ++ __u32 new_window = tp->ops->__select_window(sk); ++ ++ /* Send ACK now, if this read freed lots of space ++ * in our buffer. Certainly, new_window is new window. ++ * We can advertise it now, if it is not less than ++ * current one. ++ * "Lots" means "at least twice" here. ++ */ ++ if (new_window && new_window >= 2 * rcv_window_now) ++ tcp_send_ack(sk); ++ } ++ } ++} ++ ++static int mptcp_sub_send_fin(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct sk_buff *skb = tcp_write_queue_tail(sk); ++ int mss_now; ++ ++ /* Optimization, tack on the FIN if we have a queue of ++ * unsent frames. But be careful about outgoing SACKS ++ * and IP options. ++ */ ++ mss_now = tcp_current_mss(sk); ++ ++ if (tcp_send_head(sk) != NULL) { ++ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN; ++ TCP_SKB_CB(skb)->end_seq++; ++ tp->write_seq++; ++ } else { ++ skb = alloc_skb_fclone(MAX_TCP_HEADER, GFP_ATOMIC); ++ if (!skb) ++ return 1; ++ ++ INIT_LIST_HEAD(&skb->tcp_tsorted_anchor); ++ skb_reserve(skb, MAX_TCP_HEADER); ++ /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). 
*/ ++ tcp_init_nondata_skb(skb, tp->write_seq, ++ TCPHDR_ACK | TCPHDR_FIN); ++ tcp_queue_skb(sk, skb); ++ } ++ __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_OFF); ++ ++ return 0; ++} ++ ++static void mptcp_sub_close_doit(struct sock *sk) ++{ ++ struct sock *meta_sk = mptcp_meta_sk(sk); ++ struct tcp_sock *tp = tcp_sk(sk); ++ ++ if (sock_flag(sk, SOCK_DEAD)) ++ return; ++ ++ if (meta_sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE) { ++ tp->closing = 1; ++ tcp_close(sk, 0); ++ } else if (tcp_close_state(sk)) { ++ sk->sk_shutdown |= SEND_SHUTDOWN; ++ tcp_send_fin(sk); ++ } ++} ++ ++void mptcp_sub_close_wq(struct work_struct *work) ++{ ++ struct tcp_sock *tp = container_of(work, struct mptcp_tcp_sock, work.work)->tp; ++ struct sock *sk = (struct sock *)tp; ++ struct mptcp_cb *mpcb = tp->mpcb; ++ struct sock *meta_sk = mptcp_meta_sk(sk); ++ ++ mutex_lock(&mpcb->mpcb_mutex); ++ lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING); ++ ++ mptcp_sub_close_doit(sk); ++ ++ release_sock(meta_sk); ++ mutex_unlock(&mpcb->mpcb_mutex); ++ mptcp_mpcb_put(mpcb); ++ sock_put(sk); ++} ++ ++void mptcp_sub_close(struct sock *sk, unsigned long delay) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct delayed_work *work = &tcp_sk(sk)->mptcp->work; ++ ++ /* We are already closing - e.g., call from sock_def_error_report upon ++ * tcp_disconnect in tcp_close. ++ */ ++ if (tp->closing) ++ return; ++ ++ /* Work already scheduled? */ ++ if (work_pending(&work->work)) { ++ /* Work present - who will be first? */ ++ if (jiffies + delay > work->timer.expires) ++ return; ++ ++ /* Try canceling - if it fails, work will be executed soon */ ++ if (!cancel_delayed_work(work)) ++ return; ++ sock_put(sk); ++ mptcp_mpcb_put(tp->mpcb); ++ } ++ ++ if (!delay) { ++ unsigned char old_state = sk->sk_state; ++ ++ /* We directly send the FIN, because it may take quite a long time ++ * until the work-queue gets scheduled... ++ * ++ * If mptcp_sub_send_fin returns 1, it failed and thus we reset ++ * the old state so that tcp_close will finally send the FIN ++ * in user-context. ++ */ ++ if (!sk->sk_err && old_state != TCP_CLOSE && ++ tcp_close_state(sk) && mptcp_sub_send_fin(sk)) { ++ if (old_state == TCP_ESTABLISHED) ++ TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB); ++ sk->sk_state = old_state; ++ } ++ } ++ ++ sock_hold(sk); ++ refcount_inc(&tp->mpcb->mpcb_refcnt); ++ queue_delayed_work(mptcp_wq, work, delay); ++} ++ ++void mptcp_sub_force_close(struct sock *sk) ++{ ++ /* The below tcp_done may have freed the socket, if it is already dead. ++ * Thus, we are not allowed to access it afterwards. That's why ++ * we have to store the dead-state in this local variable.
++ */ ++ int sock_is_dead = sock_flag(sk, SOCK_DEAD); ++ ++ tcp_sk(sk)->mp_killed = 1; ++ ++ if (sk->sk_state != TCP_CLOSE) ++ tcp_done(sk); ++ ++ if (!sock_is_dead) ++ mptcp_sub_close(sk, 0); ++} ++EXPORT_SYMBOL(mptcp_sub_force_close); ++ ++/* Update the mpcb send window, based on the contributions ++ * of each subflow ++ */ ++void mptcp_update_sndbuf(const struct tcp_sock *tp) ++{ ++ struct sock *meta_sk = tp->meta_sk; ++ int new_sndbuf = 0, old_sndbuf = meta_sk->sk_sndbuf; ++ struct mptcp_tcp_sock *mptcp; ++ ++ mptcp_for_each_sub(tp->mpcb, mptcp) { ++ struct sock *sk = mptcp_to_sock(mptcp); ++ ++ if (!mptcp_sk_can_send(sk)) ++ continue; ++ ++ new_sndbuf += sk->sk_sndbuf; ++ ++ if (new_sndbuf > sock_net(meta_sk)->ipv4.sysctl_tcp_wmem[2] || ++ new_sndbuf < 0) { ++ new_sndbuf = sock_net(meta_sk)->ipv4.sysctl_tcp_wmem[2]; ++ break; ++ } ++ } ++ meta_sk->sk_sndbuf = max(min(new_sndbuf, ++ sock_net(meta_sk)->ipv4.sysctl_tcp_wmem[2]), ++ meta_sk->sk_sndbuf); ++ ++ /* The subflow's call to sk_write_space in tcp_new_space ends up in ++ * mptcp_write_space. ++ * It has nothing to do with waking up the application. ++ * So, we do it here. ++ */ ++ if (old_sndbuf != meta_sk->sk_sndbuf) ++ meta_sk->sk_write_space(meta_sk); ++} ++ ++/* Similar to: tcp_close */ ++void mptcp_close(struct sock *meta_sk, long timeout) ++{ ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk); ++ struct mptcp_cb *mpcb = meta_tp->mpcb; ++ struct mptcp_tcp_sock *mptcp; ++ struct sk_buff *skb; ++ int data_was_unread = 0; ++ int state; ++ ++ mptcp_debug("%s: Close of meta_sk with tok %#x\n", ++ __func__, mpcb->mptcp_loc_token); ++ ++ WARN_ON(refcount_inc_not_zero(&mpcb->mpcb_refcnt) == 0); ++ mutex_lock(&mpcb->mpcb_mutex); ++ lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING); ++ ++ if (meta_tp->inside_tk_table) ++ /* Detach the mpcb from the token hashtable */ ++ mptcp_hash_remove_bh(meta_tp); ++ ++ meta_sk->sk_shutdown = SHUTDOWN_MASK; ++ /* We need to flush the recv. buffs. We do this only on the ++ * descriptor close, not protocol-sourced closes, because the ++ * reader process may not have drained the data yet! ++ */ ++ while ((skb = __skb_dequeue(&meta_sk->sk_receive_queue)) != NULL) { ++ u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq; ++ ++ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) ++ len--; ++ data_was_unread += len; ++ __kfree_skb(skb); ++ } ++ ++ sk_mem_reclaim(meta_sk); ++ ++ /* If socket has been already reset (e.g. in tcp_reset()) - kill it. */ ++ if (meta_sk->sk_state == TCP_CLOSE) { ++ struct mptcp_tcp_sock *mptcp; ++ struct hlist_node *tmp; ++ ++ mptcp_for_each_sub_safe(mpcb, mptcp, tmp) { ++ struct sock *sk_it = mptcp_to_sock(mptcp); ++ ++ if (tcp_sk(sk_it)->send_mp_fclose) ++ continue; ++ mptcp_sub_close(sk_it, 0); ++ } ++ goto adjudge_to_death; ++ } ++ ++ if (data_was_unread) { ++ /* Unread data was tossed, zap the connection. */ ++ NET_INC_STATS(sock_net(meta_sk), LINUX_MIB_TCPABORTONCLOSE); ++ tcp_set_state(meta_sk, TCP_CLOSE); ++ tcp_sk(meta_sk)->ops->send_active_reset(meta_sk, ++ meta_sk->sk_allocation); ++ } else if (sock_flag(meta_sk, SOCK_LINGER) && !meta_sk->sk_lingertime) { ++ /* Check zero linger _after_ checking for unread data. 
*/ ++ meta_sk->sk_prot->disconnect(meta_sk, 0); ++ NET_INC_STATS(sock_net(meta_sk), LINUX_MIB_TCPABORTONDATA); ++ } else if (tcp_close_state(meta_sk)) { ++ mptcp_send_fin(meta_sk); ++ } else if (meta_tp->snd_una == meta_tp->write_seq) { ++ struct mptcp_tcp_sock *mptcp; ++ struct hlist_node *tmp; ++ ++ /* The DATA_FIN has been sent and acknowledged ++ * (e.g., by sk_shutdown). Close all the other subflows ++ */ ++ mptcp_for_each_sub_safe(mpcb, mptcp, tmp) { ++ struct sock *sk_it = mptcp_to_sock(mptcp); ++ unsigned long delay = 0; ++ /* If we are the passive closer, don't trigger ++ * subflow-fin until the subflow has been finned ++ * by the peer. - thus we add a delay ++ */ ++ if (mpcb->passive_close && ++ sk_it->sk_state == TCP_ESTABLISHED) ++ delay = inet_csk(sk_it)->icsk_rto << 3; ++ ++ mptcp_sub_close(sk_it, delay); ++ } ++ } ++ ++ sk_stream_wait_close(meta_sk, timeout); ++ ++adjudge_to_death: ++ state = meta_sk->sk_state; ++ sock_hold(meta_sk); ++ sock_orphan(meta_sk); ++ ++ /* socket will be freed after mptcp_close - we have to prevent ++ * access from the subflows. ++ */ ++ mptcp_for_each_sub(mpcb, mptcp) { ++ struct sock *sk_it = mptcp_to_sock(mptcp); ++ ++ /* Similar to sock_orphan, but we don't set it DEAD, because ++ * the callbacks are still set and must be called. ++ */ ++ write_lock_bh(&sk_it->sk_callback_lock); ++ sk_set_socket(sk_it, NULL); ++ sk_it->sk_wq = NULL; ++ write_unlock_bh(&sk_it->sk_callback_lock); ++ } ++ ++ if (mpcb->pm_ops->close_session) ++ mpcb->pm_ops->close_session(meta_sk); ++ ++ /* It is the last release_sock in its life. It will remove backlog. */ ++ release_sock(meta_sk); ++ ++ /* Now socket is owned by kernel and we acquire BH lock ++ * to finish close. No need to check for user refs. ++ */ ++ local_bh_disable(); ++ bh_lock_sock(meta_sk); ++ WARN_ON(sock_owned_by_user(meta_sk)); ++ ++ percpu_counter_inc(meta_sk->sk_prot->orphan_count); ++ ++ /* Have we already been destroyed by a softirq or backlog? */ ++ if (state != TCP_CLOSE && meta_sk->sk_state == TCP_CLOSE) ++ goto out; ++ ++ /* This is a (useful) BSD violating of the RFC. There is a ++ * problem with TCP as specified in that the other end could ++ * keep a socket open forever with no application left this end. ++ * We use a 3 minute timeout (about the same as BSD) then kill ++ * our end. If they send after that then tough - BUT: long enough ++ * that we won't make the old 4*rto = almost no time - whoops ++ * reset mistake. ++ * ++ * Nope, it was not mistake. It is really desired behaviour ++ * f.e. on http servers, when such sockets are useless, but ++ * consume significant resources. Let's do it with special ++ * linger2 option. 
--ANK ++ */ ++ ++ if (meta_sk->sk_state == TCP_FIN_WAIT2) { ++ if (meta_tp->linger2 < 0) { ++ tcp_set_state(meta_sk, TCP_CLOSE); ++ meta_tp->ops->send_active_reset(meta_sk, GFP_ATOMIC); ++ __NET_INC_STATS(sock_net(meta_sk), ++ LINUX_MIB_TCPABORTONLINGER); ++ } else { ++ const int tmo = tcp_fin_time(meta_sk); ++ ++ if (tmo > TCP_TIMEWAIT_LEN) { ++ inet_csk_reset_keepalive_timer(meta_sk, ++ tmo - TCP_TIMEWAIT_LEN); ++ } else { ++ meta_tp->ops->time_wait(meta_sk, TCP_FIN_WAIT2, ++ tmo); ++ goto out; ++ } ++ } ++ } ++ if (meta_sk->sk_state != TCP_CLOSE) { ++ sk_mem_reclaim(meta_sk); ++ if (tcp_check_oom(meta_sk, 0)) { ++ if (net_ratelimit()) ++ pr_info("MPTCP: out of memory: force closing socket\n"); ++ tcp_set_state(meta_sk, TCP_CLOSE); ++ meta_tp->ops->send_active_reset(meta_sk, GFP_ATOMIC); ++ __NET_INC_STATS(sock_net(meta_sk), ++ LINUX_MIB_TCPABORTONMEMORY); ++ } ++ } ++ ++ ++ if (meta_sk->sk_state == TCP_CLOSE) ++ inet_csk_destroy_sock(meta_sk); ++ /* Otherwise, socket is reprieved until protocol close. */ ++ ++out: ++ bh_unlock_sock(meta_sk); ++ local_bh_enable(); ++ mutex_unlock(&mpcb->mpcb_mutex); ++ mptcp_mpcb_put(mpcb); ++ sock_put(meta_sk); /* Taken by sock_hold */ ++} ++ ++void mptcp_disconnect(struct sock *meta_sk) ++{ ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk); ++ struct mptcp_tcp_sock *mptcp; ++ struct hlist_node *tmp; ++ ++ __skb_queue_purge(&meta_tp->mpcb->reinject_queue); ++ ++ if (meta_tp->inside_tk_table) ++ mptcp_hash_remove_bh(meta_tp); ++ ++ local_bh_disable(); ++ mptcp_for_each_sub_safe(meta_tp->mpcb, mptcp, tmp) { ++ struct sock *subsk = mptcp_to_sock(mptcp); ++ ++ if (spin_is_locked(&subsk->sk_lock.slock)) ++ bh_unlock_sock(subsk); ++ ++ tcp_sk(subsk)->tcp_disconnect = 1; ++ ++ meta_sk->sk_prot->disconnect(subsk, O_NONBLOCK); ++ ++ sock_orphan(subsk); ++ ++ percpu_counter_inc(meta_sk->sk_prot->orphan_count); ++ ++ inet_csk_destroy_sock(subsk); ++ } ++ local_bh_enable(); ++ ++ mptcp_mpcb_cleanup(meta_tp->mpcb); ++ meta_tp->meta_sk = NULL; ++ ++ meta_tp->send_mp_fclose = 0; ++ meta_tp->mpc = 0; ++ meta_tp->ops = &tcp_specific; ++#if IS_ENABLED(CONFIG_IPV6) ++ if (meta_sk->sk_family == AF_INET6) ++ meta_sk->sk_backlog_rcv = tcp_v6_do_rcv; ++ else ++ meta_sk->sk_backlog_rcv = tcp_v4_do_rcv; ++#else ++ meta_sk->sk_backlog_rcv = tcp_v4_do_rcv; ++#endif ++ meta_sk->sk_destruct = inet_sock_destruct; ++} ++ ++ ++/* Returns True if we should enable MPTCP for that socket. 
*/ ++bool mptcp_doit(struct sock *sk) ++{ ++ const struct dst_entry *dst = __sk_dst_get(sk); ++ ++ /* Don't do mptcp over loopback */ ++ if (sk->sk_family == AF_INET && ++ (ipv4_is_loopback(inet_sk(sk)->inet_daddr) || ++ ipv4_is_loopback(inet_sk(sk)->inet_saddr))) ++ return false; ++#if IS_ENABLED(CONFIG_IPV6) ++ if (sk->sk_family == AF_INET6 && ++ (ipv6_addr_loopback(&sk->sk_v6_daddr) || ++ ipv6_addr_loopback(&inet6_sk(sk)->saddr))) ++ return false; ++#endif ++ if (mptcp_v6_is_v4_mapped(sk) && ++ ipv4_is_loopback(inet_sk(sk)->inet_saddr)) ++ return false; ++ ++#ifdef CONFIG_TCP_MD5SIG ++ /* If TCP_MD5SIG is enabled, do not do MPTCP - there is no Option-Space */ ++ if (tcp_sk(sk)->af_specific->md5_lookup(sk, sk)) ++ return false; ++#endif ++ ++ if (dst->dev && (dst->dev->flags & IFF_NOMULTIPATH)) ++ return false; ++ ++ return true; ++} ++ ++int mptcp_create_master_sk(struct sock *meta_sk, __u64 remote_key, ++ int rem_key_set, __u8 mptcp_ver, u32 window) ++{ ++ struct tcp_sock *master_tp; ++ struct sock *master_sk; ++ ++ if (mptcp_alloc_mpcb(meta_sk, remote_key, rem_key_set, mptcp_ver, window)) ++ goto err_alloc_mpcb; ++ ++ master_sk = tcp_sk(meta_sk)->mpcb->master_sk; ++ master_tp = tcp_sk(master_sk); ++ ++ if (mptcp_add_sock(meta_sk, master_sk, 0, 0, GFP_ATOMIC)) ++ goto err_add_sock; ++ ++ if (__inet_inherit_port(meta_sk, master_sk) < 0) ++ goto err_add_sock; ++ ++ meta_sk->sk_prot->unhash(meta_sk); ++ inet_ehash_nolisten(master_sk, NULL); ++ ++ master_tp->mptcp->init_rcv_wnd = master_tp->rcv_wnd; ++ ++ return 0; ++ ++err_add_sock: ++ inet_csk_prepare_forced_close(master_sk); ++ tcp_done(master_sk); ++ ++err_alloc_mpcb: ++ return -ENOBUFS; ++} ++ ++static int __mptcp_check_req_master(struct sock *child, ++ const struct mptcp_options_received *mopt, ++ struct request_sock *req) ++{ ++ struct tcp_sock *child_tp = tcp_sk(child); ++ struct sock *meta_sk = child; ++ struct mptcp_cb *mpcb; ++ struct mptcp_request_sock *mtreq; ++ ++ /* Never contained an MP_CAPABLE */ ++ if (!inet_rsk(req)->mptcp_rqsk) ++ return 1; ++ ++ mtreq = mptcp_rsk(req); ++ ++ if (!inet_rsk(req)->saw_mpc) { ++ /* Fallback to regular TCP, because we saw one SYN without ++ * MP_CAPABLE. In tcp_check_req we continue the regular path. ++ * But, the socket has been added to the reqsk_tk_htb, so we ++ * must still remove it. ++ */ ++ MPTCP_INC_STATS(sock_net(meta_sk), MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK); ++ mptcp_reqsk_remove_tk(req); ++ return 1; ++ } ++ ++ /* mopt can be NULL when coming from FAST-OPEN */ ++ if (mopt && mopt->saw_mpc && mtreq->mptcp_ver == MPTCP_VERSION_1) { ++ mtreq->mptcp_rem_key = mopt->mptcp_sender_key; ++ mtreq->rem_key_set = 1; ++ } ++ ++ MPTCP_INC_STATS(sock_net(meta_sk), MPTCP_MIB_MPCAPABLEPASSIVEACK); ++ ++ /* Just set this values to pass them to mptcp_alloc_mpcb */ ++ child_tp->mptcp_loc_key = mtreq->mptcp_loc_key; ++ child_tp->mptcp_loc_token = mtreq->mptcp_loc_token; ++ ++ if (mptcp_create_master_sk(meta_sk, mtreq->mptcp_rem_key, ++ mtreq->rem_key_set, mtreq->mptcp_ver, ++ child_tp->snd_wnd)) { ++ inet_csk_prepare_forced_close(meta_sk); ++ tcp_done(meta_sk); ++ ++ return -ENOBUFS; ++ } ++ ++ child = tcp_sk(child)->mpcb->master_sk; ++ child_tp = tcp_sk(child); ++ mpcb = child_tp->mpcb; ++ ++ child_tp->mptcp->snt_isn = tcp_rsk(req)->snt_isn; ++ child_tp->mptcp->rcv_isn = tcp_rsk(req)->rcv_isn; ++ ++ mpcb->dss_csum = mtreq->dss_csum; ++ mpcb->server_side = 1; ++ ++ /* Needs to be done here additionally, because when accepting a ++ * new connection we pass by __reqsk_free and not reqsk_free. 
++ */ ++ mptcp_reqsk_remove_tk(req); ++ ++ /* Hold when creating the meta-sk in tcp_vX_syn_recv_sock. */ ++ sock_put(meta_sk); ++ ++ return 0; ++} ++ ++int mptcp_check_req_fastopen(struct sock *child, struct request_sock *req) ++{ ++ struct sock *meta_sk = child, *master_sk; ++ struct sk_buff *skb; ++ u32 new_mapping; ++ int ret; ++ ++ ret = __mptcp_check_req_master(child, NULL, req); ++ if (ret) ++ return ret; ++ ++ master_sk = tcp_sk(meta_sk)->mpcb->master_sk; ++ ++ /* We need to rewind copied_seq as it is set to IDSN + 1 and as we have ++ * pre-MPTCP data in the receive queue. ++ */ ++ tcp_sk(meta_sk)->copied_seq -= tcp_sk(master_sk)->rcv_nxt - ++ tcp_rsk(req)->rcv_isn - 1; ++ ++ /* Map subflow sequence number to data sequence numbers. We need to map ++ * these data to [IDSN - len - 1, IDSN[. ++ */ ++ new_mapping = tcp_sk(meta_sk)->copied_seq - tcp_rsk(req)->rcv_isn - 1; ++ ++ /* There should be only one skb: the SYN + data. */ ++ skb_queue_walk(&meta_sk->sk_receive_queue, skb) { ++ TCP_SKB_CB(skb)->seq += new_mapping; ++ TCP_SKB_CB(skb)->end_seq += new_mapping; ++ } ++ ++ /* With fastopen we change the semantics of the relative subflow ++ * sequence numbers to deal with middleboxes that could add/remove ++ * multiple bytes in the SYN. We chose to start counting at rcv_nxt - 1 ++ * instead of the regular TCP ISN. ++ */ ++ tcp_sk(master_sk)->mptcp->rcv_isn = tcp_sk(master_sk)->rcv_nxt - 1; ++ ++ /* We need to update copied_seq of the master_sk to account for the ++ * already moved data to the meta receive queue. ++ */ ++ tcp_sk(master_sk)->copied_seq = tcp_sk(master_sk)->rcv_nxt; ++ ++ /* Handled by the master_sk */ ++ tcp_sk(meta_sk)->fastopen_rsk = NULL; ++ ++ return 0; ++} ++ ++int mptcp_check_req_master(struct sock *sk, struct sock *child, ++ struct request_sock *req, const struct sk_buff *skb, ++ const struct mptcp_options_received *mopt, ++ int drop, u32 tsoff) ++{ ++ struct sock *meta_sk = child; ++ int ret; ++ ++ ret = __mptcp_check_req_master(child, mopt, req); ++ if (ret) ++ return ret; ++ child = tcp_sk(child)->mpcb->master_sk; ++ ++ sock_rps_save_rxhash(child, skb); ++ ++ /* drop indicates that we come from tcp_check_req and thus need to ++ * handle the request-socket fully. ++ */ ++ if (drop) { ++ tcp_synack_rtt_meas(child, req); ++ ++ inet_csk_reqsk_queue_drop(sk, req); ++ reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req); ++ if (!inet_csk_reqsk_queue_add(sk, req, meta_sk)) { ++ bh_unlock_sock(meta_sk); ++ /* No sock_put() of the meta needed. The reference has ++ * already been dropped in __mptcp_check_req_master(). ++ */ ++ sock_put(child); ++ return -1; ++ } ++ } else { ++ /* Thus, we come from syn-cookies */ ++ refcount_set(&req->rsk_refcnt, 1); ++ tcp_sk(meta_sk)->tsoffset = tsoff; ++ if (!inet_csk_reqsk_queue_add(sk, req, meta_sk)) { ++ bh_unlock_sock(meta_sk); ++ /* No sock_put() of the meta needed. The reference has ++ * already been dropped in __mptcp_check_req_master(). 
++ */ ++ sock_put(child); ++ reqsk_put(req); ++ return -1; ++ } ++ } ++ ++ return 0; ++} ++ ++/* May be called without holding the meta-level lock */ ++struct sock *mptcp_check_req_child(struct sock *meta_sk, ++ struct sock *child, ++ struct request_sock *req, ++ struct sk_buff *skb, ++ const struct mptcp_options_received *mopt) ++{ ++ const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; ++ struct mptcp_request_sock *mtreq = mptcp_rsk(req); ++ struct tcp_sock *child_tp = tcp_sk(child); ++ u8 hash_mac_check[SHA256_DIGEST_SIZE]; ++ ++ if (!mopt->join_ack) { ++ MPTCP_INC_STATS(sock_net(meta_sk), MPTCP_MIB_JOINACKFAIL); ++ goto teardown; ++ } ++ ++ mptcp_hmac(mpcb->mptcp_ver, (u8 *)&mpcb->mptcp_rem_key, ++ (u8 *)&mpcb->mptcp_loc_key, hash_mac_check, 2, ++ 4, (u8 *)&mtreq->mptcp_rem_nonce, ++ 4, (u8 *)&mtreq->mptcp_loc_nonce); ++ ++ if (memcmp(hash_mac_check, (char *)&mopt->mptcp_recv_mac, 20)) { ++ MPTCP_INC_STATS(sock_net(meta_sk), MPTCP_MIB_JOINACKMAC); ++ goto teardown; ++ } ++ ++ /* Point it to the same struct socket and wq as the meta_sk */ ++ sk_set_socket(child, meta_sk->sk_socket); ++ child->sk_wq = meta_sk->sk_wq; ++ ++ if (mptcp_add_sock(meta_sk, child, mtreq->loc_id, mtreq->rem_id, GFP_ATOMIC)) { ++ /* Has been inherited, but now child_tp->mptcp is NULL */ ++ child_tp->mpc = 0; ++ child_tp->ops = &tcp_specific; ++ ++ /* TODO when we support acking the third ack for new subflows, ++ * we should silently discard this third ack, by returning NULL. ++ * ++ * Maybe, at the retransmission we will have enough memory to ++ * fully add the socket to the meta-sk. ++ */ ++ goto teardown; ++ } ++ ++ /* The child is a clone of the meta socket, we must now reset ++ * some of the fields ++ */ ++ child_tp->mptcp->rcv_low_prio = mtreq->rcv_low_prio; ++ ++ /* We should allow proper increase of the snd/rcv-buffers. Thus, we ++ * use the original values instead of the bloated up ones from the ++ * clone. ++ */ ++ child->sk_sndbuf = mpcb->orig_sk_sndbuf; ++ child->sk_rcvbuf = mpcb->orig_sk_rcvbuf; ++ ++ child_tp->mptcp->slave_sk = 1; ++ child_tp->mptcp->snt_isn = tcp_rsk(req)->snt_isn; ++ child_tp->mptcp->rcv_isn = tcp_rsk(req)->rcv_isn; ++ child_tp->mptcp->init_rcv_wnd = req->rsk_rcv_wnd; ++ ++ child->sk_tsq_flags = 0; ++ ++ child_tp->packets_out = 0; ++ ++ tcp_reset_vars(child); ++ ++ sock_rps_save_rxhash(child, skb); ++ tcp_synack_rtt_meas(child, req); ++ ++ if (mpcb->pm_ops->established_subflow) ++ mpcb->pm_ops->established_subflow(child); ++ ++ /* Subflows do not use the accept queue, as they ++ * are attached immediately to the mpcb. ++ */ ++ inet_csk_reqsk_queue_drop(meta_sk, req); ++ reqsk_queue_removed(&inet_csk(meta_sk)->icsk_accept_queue, req); ++ ++ /* The refcnt is initialized to 2, because regular TCP will put him ++ * in the socket's listener queue. However, we do not have a listener-queue. ++ * So, we need to make sure that this request-sock indeed gets destroyed. ++ */ ++ reqsk_put(req); ++ ++ MPTCP_INC_STATS(sock_net(meta_sk), MPTCP_MIB_JOINACKRX); ++ ++ if (inet_sk(child)->inet_sport != inet_sk(meta_sk)->inet_sport) ++ MPTCP_INC_STATS(sock_net(meta_sk), MPTCP_MIB_JOINALTERNATEPORT); ++ ++ return child; ++ ++teardown: ++ req->rsk_ops->send_reset(meta_sk, skb); ++ ++ /* Drop this request - sock creation failed. 
*/ ++ inet_csk_reqsk_queue_drop(meta_sk, req); ++ reqsk_queue_removed(&inet_csk(meta_sk)->icsk_accept_queue, req); ++ inet_csk_prepare_forced_close(child); ++ tcp_done(child); ++ bh_unlock_sock(meta_sk); ++ return meta_sk; ++} ++ ++int mptcp_init_tw_sock(struct sock *sk, struct tcp_timewait_sock *tw) ++{ ++ struct mptcp_tw *mptw; ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct mptcp_cb *mpcb = tp->mpcb; ++ ++ /* A subsocket in tw can only receive data. So, if we are in ++ * infinite-receive, then we should not reply with a data-ack or act ++ * upon general MPTCP-signaling. We prevent this by simply not creating ++ * the mptcp_tw_sock. ++ */ ++ if (mpcb->infinite_mapping_rcv) { ++ tw->mptcp_tw = NULL; ++ return 0; ++ } ++ ++ /* Alloc MPTCP-tw-sock */ ++ mptw = kmem_cache_alloc(mptcp_tw_cache, GFP_ATOMIC); ++ if (!mptw) { ++ tw->mptcp_tw = NULL; ++ return -ENOBUFS; ++ } ++ ++ refcount_inc(&mpcb->mpcb_refcnt); ++ ++ tw->mptcp_tw = mptw; ++ mptw->loc_key = mpcb->mptcp_loc_key; ++ mptw->meta_tw = mpcb->in_time_wait; ++ mptw->rcv_nxt = mptcp_get_rcv_nxt_64(mptcp_meta_tp(tp)); ++ if (mptw->meta_tw && mpcb->mptw_state != TCP_TIME_WAIT) ++ mptw->rcv_nxt++; ++ rcu_assign_pointer(mptw->mpcb, mpcb); ++ ++ spin_lock_bh(&mpcb->mpcb_list_lock); ++ list_add_rcu(&mptw->list, &tp->mpcb->tw_list); ++ mptw->in_list = 1; ++ spin_unlock_bh(&mpcb->mpcb_list_lock); ++ ++ return 0; ++} ++ ++void mptcp_twsk_destructor(struct tcp_timewait_sock *tw) ++{ ++ struct mptcp_cb *mpcb; ++ ++ rcu_read_lock(); ++ local_bh_disable(); ++ mpcb = rcu_dereference(tw->mptcp_tw->mpcb); ++ ++ /* If we are still holding a ref to the mpcb, we have to remove ourself ++ * from the list and drop the ref properly. ++ */ ++ if (mpcb && refcount_inc_not_zero(&mpcb->mpcb_refcnt)) { ++ spin_lock(&mpcb->mpcb_list_lock); ++ if (tw->mptcp_tw->in_list) { ++ list_del_rcu(&tw->mptcp_tw->list); ++ tw->mptcp_tw->in_list = 0; ++ /* Put, because we added it to the list */ ++ mptcp_mpcb_put(mpcb); ++ } ++ spin_unlock(&mpcb->mpcb_list_lock); ++ ++ /* Second time, because we increased it above */ ++ mptcp_mpcb_put(mpcb); ++ } ++ ++ local_bh_enable(); ++ rcu_read_unlock(); ++ ++ kmem_cache_free(mptcp_tw_cache, tw->mptcp_tw); ++} ++ ++/* Updates the rcv_nxt of the time-wait-socks and allows them to ack a ++ * data-fin. ++ */ ++void mptcp_time_wait(struct sock *meta_sk, int state, int timeo) ++{ ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk); ++ struct mptcp_tw *mptw; ++ ++ if (mptcp_in_infinite_mapping_weak(meta_tp->mpcb)) { ++ struct mptcp_tcp_sock *mptcp; ++ struct hlist_node *tmp; ++ ++ mptcp_for_each_sub_safe(meta_tp->mpcb, mptcp, tmp) { ++ struct sock *sk_it = mptcp_to_sock(mptcp); ++ ++ if (sk_it->sk_state == TCP_CLOSE) ++ continue; ++ ++ tcp_sk(sk_it)->ops->time_wait(sk_it, state, timeo); ++ } ++ } ++ ++ /* Used for sockets that go into tw after the meta ++ * (see mptcp_init_tw_sock()) ++ */ ++ meta_tp->mpcb->in_time_wait = 1; ++ meta_tp->mpcb->mptw_state = state; ++ ++ /* Update the time-wait-sock's information */ ++ rcu_read_lock(); ++ local_bh_disable(); ++ list_for_each_entry_rcu(mptw, &meta_tp->mpcb->tw_list, list) { ++ mptw->meta_tw = 1; ++ mptw->rcv_nxt = mptcp_get_rcv_nxt_64(meta_tp); ++ ++ /* We want to ack a DATA_FIN, but are yet in FIN_WAIT_2 - ++ * pretend as if the DATA_FIN has already reached us, that way ++ * the checks in tcp_timewait_state_process will be good as the ++ * DATA_FIN comes in. 
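++	 * That is why rcv_nxt is advanced by one below whenever the new state is
++	 * not TCP_TIME_WAIT.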
++ */ ++ if (state != TCP_TIME_WAIT) ++ mptw->rcv_nxt++; ++ } ++ local_bh_enable(); ++ rcu_read_unlock(); ++ ++ if (meta_sk->sk_state != TCP_CLOSE) ++ tcp_done(meta_sk); ++} ++ ++void mptcp_tsq_flags(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct sock *meta_sk = mptcp_meta_sk(sk); ++ ++ /* It will be handled as a regular deferred-call */ ++ if (is_meta_sk(sk)) ++ return; ++ ++ if (hlist_unhashed(&tp->mptcp->cb_list)) { ++ hlist_add_head(&tp->mptcp->cb_list, &tp->mpcb->callback_list); ++ /* We need to hold it here, as the sock_hold is not assured ++ * by the release_sock as it is done in regular TCP. ++ * ++ * The subsocket may get inet_csk_destroy'd while it is inside ++ * the callback_list. ++ */ ++ sock_hold(sk); ++ } ++ ++ if (!test_and_set_bit(MPTCP_SUB_DEFERRED, &meta_sk->sk_tsq_flags)) ++ sock_hold(meta_sk); ++} ++ ++void mptcp_tsq_sub_deferred(struct sock *meta_sk) ++{ ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk); ++ struct mptcp_tcp_sock *mptcp; ++ struct hlist_node *tmp; ++ ++ __sock_put(meta_sk); ++ hlist_for_each_entry_safe(mptcp, tmp, &meta_tp->mpcb->callback_list, cb_list) { ++ struct tcp_sock *tp = mptcp->tp; ++ struct sock *sk = (struct sock *)tp; ++ ++ hlist_del_init(&mptcp->cb_list); ++ sk->sk_prot->release_cb(sk); ++ /* Final sock_put (cfr. mptcp_tsq_flags) */ ++ sock_put(sk); ++ } ++} ++ ++/* May be called without holding the meta-level lock */ ++void mptcp_join_reqsk_init(const struct mptcp_cb *mpcb, ++ const struct request_sock *req, ++ struct sk_buff *skb) ++{ ++ struct mptcp_request_sock *mtreq = mptcp_rsk(req); ++ u8 mptcp_hash_mac[SHA256_DIGEST_SIZE]; ++ struct mptcp_options_received mopt; ++ ++ mptcp_init_mp_opt(&mopt); ++ tcp_parse_mptcp_options(skb, &mopt); ++ ++ mtreq->is_sub = 1; ++ inet_rsk(req)->mptcp_rqsk = 1; ++ ++ mtreq->mptcp_rem_nonce = mopt.mptcp_recv_nonce; ++ ++ mptcp_hmac(mpcb->mptcp_ver, (u8 *)&mpcb->mptcp_loc_key, ++ (u8 *)&mpcb->mptcp_rem_key, mptcp_hash_mac, 2, ++ 4, (u8 *)&mtreq->mptcp_loc_nonce, ++ 4, (u8 *)&mtreq->mptcp_rem_nonce); ++ mtreq->mptcp_hash_tmac = *(u64 *)mptcp_hash_mac; ++ ++ mtreq->rem_id = mopt.rem_id; ++ mtreq->rcv_low_prio = mopt.low_prio; ++ inet_rsk(req)->saw_mpc = 1; ++ ++ MPTCP_INC_STATS(sock_net(mpcb->meta_sk), MPTCP_MIB_JOINSYNRX); ++} ++ ++void mptcp_reqsk_init(struct request_sock *req, const struct sock *sk, ++ const struct sk_buff *skb, bool want_cookie) ++{ ++ struct mptcp_options_received mopt; ++ struct mptcp_request_sock *mtreq = mptcp_rsk(req); ++ ++ mptcp_init_mp_opt(&mopt); ++ tcp_parse_mptcp_options(skb, &mopt); ++ ++ mtreq->dss_csum = mopt.dss_csum; ++ ++ if (want_cookie) { ++ if (!mptcp_reqsk_new_cookie(req, sk, &mopt, skb)) ++ /* No key available - back to regular TCP */ ++ inet_rsk(req)->mptcp_rqsk = 0; ++ return; ++ } ++ ++ mptcp_reqsk_new_mptcp(req, sk, &mopt, skb); ++} ++ ++void mptcp_cookies_reqsk_init(struct request_sock *req, ++ struct mptcp_options_received *mopt, ++ struct sk_buff *skb) ++{ ++ struct mptcp_request_sock *mtreq = mptcp_rsk(req); ++ ++ /* Absolutely need to always initialize this. 
*/ ++ mtreq->hash_entry.pprev = NULL; ++ ++ mtreq->mptcp_ver = mopt->mptcp_ver; ++ mtreq->mptcp_rem_key = mopt->mptcp_sender_key; ++ mtreq->mptcp_loc_key = mopt->mptcp_receiver_key; ++ mtreq->rem_key_set = 1; ++ ++ /* Generate the token */ ++ mptcp_key_hash(mtreq->mptcp_ver, mtreq->mptcp_loc_key, &mtreq->mptcp_loc_token, NULL); ++ ++ rcu_read_lock(); ++ local_bh_disable(); ++ spin_lock(&mptcp_tk_hashlock); ++ ++ /* Check, if the key is still free */ ++ if (mptcp_reqsk_find_tk(mtreq->mptcp_loc_token) || ++ mptcp_find_token(mtreq->mptcp_loc_token)) ++ goto out; ++ ++ inet_rsk(req)->saw_mpc = 1; ++ mtreq->is_sub = 0; ++ inet_rsk(req)->mptcp_rqsk = 1; ++ mtreq->dss_csum = mopt->dss_csum; ++ ++out: ++ spin_unlock(&mptcp_tk_hashlock); ++ local_bh_enable(); ++ rcu_read_unlock(); ++} ++ ++int mptcp_conn_request(struct sock *sk, struct sk_buff *skb) ++{ ++ struct mptcp_options_received mopt; ++ ++ mptcp_init_mp_opt(&mopt); ++ tcp_parse_mptcp_options(skb, &mopt); ++ ++ if (mopt.is_mp_join) ++ return mptcp_do_join_short(skb, &mopt, sock_net(sk)); ++ if (mopt.drop_me) ++ goto drop; ++ ++ if (!sock_flag(sk, SOCK_MPTCP)) ++ mopt.saw_mpc = 0; ++ ++ /* If the requested version is higher than what we support, fall back */ ++ if (mopt.saw_mpc && mopt.mptcp_ver > tcp_sk(sk)->mptcp_ver) ++ mopt.saw_mpc = 0; ++ ++ if (skb->protocol == htons(ETH_P_IP)) { ++ if (mopt.saw_mpc) { ++ if (skb_rtable(skb)->rt_flags & ++ (RTCF_BROADCAST | RTCF_MULTICAST)) ++ goto drop; ++ ++ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEPASSIVE); ++ return tcp_conn_request(&mptcp_request_sock_ops, ++ &mptcp_request_sock_ipv4_ops, ++ sk, skb); ++ } ++ ++ return tcp_v4_conn_request(sk, skb); ++#if IS_ENABLED(CONFIG_IPV6) ++ } else { ++ if (mopt.saw_mpc) { ++ if (!ipv6_unicast_destination(skb)) ++ goto drop; ++ ++ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEPASSIVE); ++ return tcp_conn_request(&mptcp6_request_sock_ops, ++ &mptcp_request_sock_ipv6_ops, ++ sk, skb); ++ } ++ ++ return tcp_v6_conn_request(sk, skb); ++#endif ++ } ++drop: ++ NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENDROPS); ++ return 0; ++} ++ ++int mptcp_finish_handshake(struct sock *child, struct sk_buff *skb) ++ __releases(&child->sk_lock.slock) ++{ ++ int ret; ++ ++ /* We don't call tcp_child_process here, because we hold ++ * already the meta-sk-lock and are sure that it is not owned ++ * by the user. 
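++	 * Instead, account the incoming segment and run tcp_rcv_state_process()
++	 * on the subflow directly.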
++ */ ++ tcp_sk(child)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs); ++ ret = tcp_rcv_state_process(child, skb); ++ bh_unlock_sock(child); ++ sock_put(child); ++ ++ return ret; ++} ++ ++static void __mptcp_get_info(const struct sock *meta_sk, ++ struct mptcp_meta_info *info) ++{ ++ const struct inet_connection_sock *meta_icsk = inet_csk(meta_sk); ++ const struct tcp_sock *meta_tp = tcp_sk(meta_sk); ++ u32 now = tcp_jiffies32; ++ ++ memset(info, 0, sizeof(*info)); ++ ++ info->mptcpi_state = meta_sk->sk_state; ++ info->mptcpi_retransmits = meta_icsk->icsk_retransmits; ++ info->mptcpi_probes = meta_icsk->icsk_probes_out; ++ info->mptcpi_backoff = meta_icsk->icsk_backoff; ++ ++ info->mptcpi_rto = jiffies_to_usecs(meta_icsk->icsk_rto); ++ ++ info->mptcpi_unacked = meta_tp->packets_out; ++ ++ info->mptcpi_last_data_sent = jiffies_to_msecs(now - meta_tp->lsndtime); ++ info->mptcpi_last_data_recv = jiffies_to_msecs(now - meta_icsk->icsk_ack.lrcvtime); ++ info->mptcpi_last_ack_recv = jiffies_to_msecs(now - meta_tp->rcv_tstamp); ++ ++ info->mptcpi_total_retrans = meta_tp->total_retrans; ++ ++ info->mptcpi_bytes_acked = meta_tp->bytes_acked; ++ info->mptcpi_bytes_received = meta_tp->bytes_received; ++} ++ ++static void mptcp_get_sub_info(struct sock *sk, struct mptcp_sub_info *info) ++{ ++ struct inet_sock *inet = inet_sk(sk); ++ ++ memset(info, 0, sizeof(*info)); ++ ++ if (sk->sk_family == AF_INET) { ++ info->src_v4.sin_family = AF_INET; ++ info->src_v4.sin_port = inet->inet_sport; ++ ++ info->src_v4.sin_addr.s_addr = inet->inet_rcv_saddr; ++ if (!info->src_v4.sin_addr.s_addr) ++ info->src_v4.sin_addr.s_addr = inet->inet_saddr; ++ ++ info->dst_v4.sin_family = AF_INET; ++ info->dst_v4.sin_port = inet->inet_dport; ++ info->dst_v4.sin_addr.s_addr = inet->inet_daddr; ++#if IS_ENABLED(CONFIG_IPV6) ++ } else { ++ struct ipv6_pinfo *np = inet6_sk(sk); ++ ++ info->src_v6.sin6_family = AF_INET6; ++ info->src_v6.sin6_port = inet->inet_sport; ++ ++ if (ipv6_addr_any(&sk->sk_v6_rcv_saddr)) ++ info->src_v6.sin6_addr = np->saddr; ++ else ++ info->src_v6.sin6_addr = sk->sk_v6_rcv_saddr; ++ ++ info->dst_v6.sin6_family = AF_INET6; ++ info->dst_v6.sin6_port = inet->inet_dport; ++ info->dst_v6.sin6_addr = sk->sk_v6_daddr; ++#endif ++ } ++} ++ ++int mptcp_get_info(const struct sock *meta_sk, char __user *optval, int optlen) ++{ ++ const struct tcp_sock *meta_tp = tcp_sk(meta_sk); ++ ++ struct mptcp_meta_info meta_info; ++ struct mptcp_info m_info; ++ ++ unsigned int info_len; ++ ++ /* Check again with the lock held */ ++ if (!mptcp(meta_tp)) ++ return -EINVAL; ++ ++ if (copy_from_user(&m_info, optval, optlen)) ++ return -EFAULT; ++ ++ if (m_info.meta_info) { ++ unsigned int len; ++ ++ __mptcp_get_info(meta_sk, &meta_info); ++ ++ /* Need to set this, if user thinks that tcp_info is bigger than ours */ ++ len = min_t(unsigned int, m_info.meta_len, sizeof(meta_info)); ++ m_info.meta_len = len; ++ ++ if (copy_to_user((void __user *)m_info.meta_info, &meta_info, len)) ++ return -EFAULT; ++ } ++ ++ /* Need to set this, if user thinks that tcp_info is bigger than ours */ ++ info_len = min_t(unsigned int, m_info.tcp_info_len, sizeof(struct tcp_info)); ++ m_info.tcp_info_len = info_len; ++ ++ if (m_info.initial) { ++ struct mptcp_cb *mpcb = meta_tp->mpcb; ++ ++ if (mpcb->master_sk) { ++ struct tcp_info info; ++ ++ tcp_get_info(mpcb->master_sk, &info, true); ++ if (copy_to_user((void __user *)m_info.initial, &info, info_len)) ++ return -EFAULT; ++ } else if (meta_tp->record_master_info && mpcb->master_info) { ++ if 
(copy_to_user((void __user *)m_info.initial, mpcb->master_info, info_len)) ++ return -EFAULT; ++ } else { ++ return meta_tp->record_master_info ? -ENOMEM : -EINVAL; ++ } ++ } ++ ++ if (m_info.subflows) { ++ unsigned int len, sub_len = 0; ++ struct mptcp_tcp_sock *mptcp; ++ char __user *ptr; ++ ++ ptr = (char __user *)m_info.subflows; ++ len = m_info.sub_len; ++ ++ mptcp_for_each_sub(meta_tp->mpcb, mptcp) { ++ struct tcp_info t_info; ++ unsigned int tmp_len; ++ ++ tcp_get_info(mptcp_to_sock(mptcp), &t_info, true); ++ ++ tmp_len = min_t(unsigned int, len, info_len); ++ len -= tmp_len; ++ ++ if (copy_to_user(ptr, &t_info, tmp_len)) ++ return -EFAULT; ++ ++ ptr += tmp_len; ++ sub_len += tmp_len; ++ ++ if (len == 0) ++ break; ++ } ++ ++ m_info.sub_len = sub_len; ++ } ++ ++ if (m_info.subflow_info) { ++ unsigned int len, sub_info_len, total_sub_info_len = 0; ++ struct mptcp_tcp_sock *mptcp; ++ char __user *ptr; ++ ++ ptr = (char __user *)m_info.subflow_info; ++ len = m_info.total_sub_info_len; ++ ++ sub_info_len = min_t(unsigned int, m_info.sub_info_len, ++ sizeof(struct mptcp_sub_info)); ++ m_info.sub_info_len = sub_info_len; ++ ++ mptcp_for_each_sub(meta_tp->mpcb, mptcp) { ++ struct mptcp_sub_info m_sub_info; ++ unsigned int tmp_len; ++ ++ mptcp_get_sub_info(mptcp_to_sock(mptcp), &m_sub_info); ++ ++ tmp_len = min_t(unsigned int, len, sub_info_len); ++ len -= tmp_len; ++ ++ if (copy_to_user(ptr, &m_sub_info, tmp_len)) ++ return -EFAULT; ++ ++ ptr += tmp_len; ++ total_sub_info_len += tmp_len; ++ ++ if (len == 0) ++ break; ++ } ++ ++ m_info.total_sub_info_len = total_sub_info_len; ++ } ++ ++ if (copy_to_user(optval, &m_info, optlen)) ++ return -EFAULT; ++ ++ return 0; ++} ++ ++void mptcp_clear_sk(struct sock *sk, int size) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ ++ /* we do not want to clear tk_table field, because of RCU lookups */ ++ sk_prot_clear_nulls(sk, offsetof(struct tcp_sock, tk_table.next)); ++ ++ size -= offsetof(struct tcp_sock, tk_table.pprev); ++ memset((char *)&tp->tk_table.pprev, 0, size); ++} ++ ++static const struct snmp_mib mptcp_snmp_list[] = { ++ SNMP_MIB_ITEM("MPCapableSYNRX", MPTCP_MIB_MPCAPABLEPASSIVE), ++ SNMP_MIB_ITEM("MPCapableSYNTX", MPTCP_MIB_MPCAPABLEACTIVE), ++ SNMP_MIB_ITEM("MPCapableSYNACKRX", MPTCP_MIB_MPCAPABLEACTIVEACK), ++ SNMP_MIB_ITEM("MPCapableACKRX", MPTCP_MIB_MPCAPABLEPASSIVEACK), ++ SNMP_MIB_ITEM("MPCapableFallbackACK", MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK), ++ SNMP_MIB_ITEM("MPCapableFallbackSYNACK", MPTCP_MIB_MPCAPABLEACTIVEFALLBACK), ++ SNMP_MIB_ITEM("MPCapableRetransFallback", MPTCP_MIB_MPCAPABLERETRANSFALLBACK), ++ SNMP_MIB_ITEM("MPTCPCsumEnabled", MPTCP_MIB_CSUMENABLED), ++ SNMP_MIB_ITEM("MPTCPRetrans", MPTCP_MIB_RETRANSSEGS), ++ SNMP_MIB_ITEM("MPFailRX", MPTCP_MIB_MPFAILRX), ++ SNMP_MIB_ITEM("MPCsumFail", MPTCP_MIB_CSUMFAIL), ++ SNMP_MIB_ITEM("MPFastcloseRX", MPTCP_MIB_FASTCLOSERX), ++ SNMP_MIB_ITEM("MPFastcloseTX", MPTCP_MIB_FASTCLOSETX), ++ SNMP_MIB_ITEM("MPFallbackAckSub", MPTCP_MIB_FBACKSUB), ++ SNMP_MIB_ITEM("MPFallbackAckInit", MPTCP_MIB_FBACKINIT), ++ SNMP_MIB_ITEM("MPFallbackDataSub", MPTCP_MIB_FBDATASUB), ++ SNMP_MIB_ITEM("MPFallbackDataInit", MPTCP_MIB_FBDATAINIT), ++ SNMP_MIB_ITEM("MPRemoveAddrSubDelete", MPTCP_MIB_REMADDRSUB), ++ SNMP_MIB_ITEM("MPJoinNoTokenFound", MPTCP_MIB_JOINNOTOKEN), ++ SNMP_MIB_ITEM("MPJoinAlreadyFallenback", MPTCP_MIB_JOINFALLBACK), ++ SNMP_MIB_ITEM("MPJoinSynTx", MPTCP_MIB_JOINSYNTX), ++ SNMP_MIB_ITEM("MPJoinSynRx", MPTCP_MIB_JOINSYNRX), ++ SNMP_MIB_ITEM("MPJoinSynAckRx", MPTCP_MIB_JOINSYNACKRX), ++ 
SNMP_MIB_ITEM("MPJoinSynAckHMacFailure", MPTCP_MIB_JOINSYNACKMAC), ++ SNMP_MIB_ITEM("MPJoinAckRx", MPTCP_MIB_JOINACKRX), ++ SNMP_MIB_ITEM("MPJoinAckHMacFailure", MPTCP_MIB_JOINACKMAC), ++ SNMP_MIB_ITEM("MPJoinAckMissing", MPTCP_MIB_JOINACKFAIL), ++ SNMP_MIB_ITEM("MPJoinAckRTO", MPTCP_MIB_JOINACKRTO), ++ SNMP_MIB_ITEM("MPJoinAckRexmit", MPTCP_MIB_JOINACKRXMIT), ++ SNMP_MIB_ITEM("NoDSSInWindow", MPTCP_MIB_NODSSWINDOW), ++ SNMP_MIB_ITEM("DSSNotMatching", MPTCP_MIB_DSSNOMATCH), ++ SNMP_MIB_ITEM("InfiniteMapRx", MPTCP_MIB_INFINITEMAPRX), ++ SNMP_MIB_ITEM("DSSNoMatchTCP", MPTCP_MIB_DSSTCPMISMATCH), ++ SNMP_MIB_ITEM("DSSTrimHead", MPTCP_MIB_DSSTRIMHEAD), ++ SNMP_MIB_ITEM("DSSSplitTail", MPTCP_MIB_DSSSPLITTAIL), ++ SNMP_MIB_ITEM("DSSPurgeOldSubSegs", MPTCP_MIB_PURGEOLD), ++ SNMP_MIB_ITEM("AddAddrRx", MPTCP_MIB_ADDADDRRX), ++ SNMP_MIB_ITEM("AddAddrTx", MPTCP_MIB_ADDADDRTX), ++ SNMP_MIB_ITEM("RemAddrRx", MPTCP_MIB_REMADDRRX), ++ SNMP_MIB_ITEM("RemAddrTx", MPTCP_MIB_REMADDRTX), ++ SNMP_MIB_ITEM("MPJoinAlternatePort", MPTCP_MIB_JOINALTERNATEPORT), ++ SNMP_MIB_ITEM("MPCurrEstab", MPTCP_MIB_CURRESTAB), ++ SNMP_MIB_SENTINEL ++}; ++ ++struct workqueue_struct *mptcp_wq; ++EXPORT_SYMBOL(mptcp_wq); ++ ++/* Output /proc/net/mptcp */ ++static int mptcp_pm_seq_show(struct seq_file *seq, void *v) ++{ ++ struct tcp_sock *meta_tp; ++ const struct net *net = seq->private; ++ unsigned int i, n = 0; ++ ++ seq_printf(seq, " sl loc_tok rem_tok v6 local_address remote_address st ns tx_queue rx_queue inode"); ++ seq_putc(seq, '\n'); ++ ++ for (i = 0; i <= mptcp_tk_htable.mask; i++) { ++ struct hlist_nulls_node *node; ++ rcu_read_lock(); ++ local_bh_disable(); ++ hlist_nulls_for_each_entry_rcu(meta_tp, node, ++ &mptcp_tk_htable.hashtable[i], ++ tk_table) { ++ struct sock *meta_sk = (struct sock *)meta_tp; ++ struct inet_sock *isk = inet_sk(meta_sk); ++ struct mptcp_cb *mpcb = meta_tp->mpcb; ++ ++ if (!mptcp(meta_tp) || !net_eq(net, sock_net(meta_sk))) ++ continue; ++ ++ if (!mpcb) ++ continue; ++ ++ if (capable(CAP_NET_ADMIN)) { ++ seq_printf(seq, "%4d: %04X %04X ", n++, ++ mpcb->mptcp_loc_token, ++ mpcb->mptcp_rem_token); ++ } else { ++ seq_printf(seq, "%4d: %04X %04X ", n++, -1, -1); ++ } ++ if (meta_sk->sk_family == AF_INET || ++ mptcp_v6_is_v4_mapped(meta_sk)) { ++ seq_printf(seq, " 0 %08X:%04X %08X:%04X ", ++ isk->inet_rcv_saddr, ++ ntohs(isk->inet_sport), ++ isk->inet_daddr, ++ ntohs(isk->inet_dport)); ++#if IS_ENABLED(CONFIG_IPV6) ++ } else if (meta_sk->sk_family == AF_INET6) { ++ struct in6_addr *src = &meta_sk->sk_v6_rcv_saddr; ++ struct in6_addr *dst = &meta_sk->sk_v6_daddr; ++ seq_printf(seq, " 1 %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X", ++ src->s6_addr32[0], src->s6_addr32[1], ++ src->s6_addr32[2], src->s6_addr32[3], ++ ntohs(isk->inet_sport), ++ dst->s6_addr32[0], dst->s6_addr32[1], ++ dst->s6_addr32[2], dst->s6_addr32[3], ++ ntohs(isk->inet_dport)); ++#endif ++ } ++ ++ seq_printf(seq, " %02X %02X %08X:%08X %lu", ++ meta_sk->sk_state, mptcp_subflow_count(mpcb), ++ meta_tp->write_seq - meta_tp->snd_una, ++ max_t(int, meta_tp->rcv_nxt - ++ meta_tp->copied_seq, 0), ++ sock_i_ino(meta_sk)); ++ seq_putc(seq, '\n'); ++ } ++ ++ local_bh_enable(); ++ rcu_read_unlock(); ++ } ++ ++ return 0; ++} ++ ++static int mptcp_snmp_seq_show(struct seq_file *seq, void *v) ++{ ++ struct net *net = seq->private; ++ int i; ++ ++ for (i = 0; mptcp_snmp_list[i].name != NULL; i++) ++ seq_printf(seq, "%-32s\t%ld\n", mptcp_snmp_list[i].name, ++ snmp_fold_field(net->mptcp.mptcp_statistics, ++ mptcp_snmp_list[i].entry)); ++ ++ return 
0; ++} ++ ++static int mptcp_pm_init_net(struct net *net) ++{ ++ net->mptcp.mptcp_statistics = alloc_percpu(struct mptcp_mib); ++ if (!net->mptcp.mptcp_statistics) ++ goto out_mptcp_mibs; ++ ++#ifdef CONFIG_PROC_FS ++ net->mptcp.proc_net_mptcp = proc_net_mkdir(net, "mptcp_net", net->proc_net); ++ if (!net->mptcp.proc_net_mptcp) ++ goto out_proc_net_mptcp; ++ if (!proc_create_net_single("mptcp", S_IRUGO, net->mptcp.proc_net_mptcp, ++ mptcp_pm_seq_show, NULL)) ++ goto out_mptcp_net_mptcp; ++ if (!proc_create_net_single("snmp", S_IRUGO, net->mptcp.proc_net_mptcp, ++ mptcp_snmp_seq_show, NULL)) ++ goto out_mptcp_net_snmp; ++#endif ++ ++ return 0; ++ ++#ifdef CONFIG_PROC_FS ++out_mptcp_net_snmp: ++ remove_proc_entry("mptcp", net->mptcp.proc_net_mptcp); ++out_mptcp_net_mptcp: ++ remove_proc_subtree("mptcp_net", net->proc_net); ++ net->mptcp.proc_net_mptcp = NULL; ++out_proc_net_mptcp: ++ free_percpu(net->mptcp.mptcp_statistics); ++#endif ++out_mptcp_mibs: ++ return -ENOMEM; ++} ++ ++static void mptcp_pm_exit_net(struct net *net) ++{ ++ remove_proc_entry("snmp", net->mptcp.proc_net_mptcp); ++ remove_proc_entry("mptcp", net->mptcp.proc_net_mptcp); ++ remove_proc_subtree("mptcp_net", net->proc_net); ++ free_percpu(net->mptcp.mptcp_statistics); ++} ++ ++static struct pernet_operations mptcp_pm_proc_ops = { ++ .init = mptcp_pm_init_net, ++ .exit = mptcp_pm_exit_net, ++}; ++ ++static unsigned long mptcp_htable_entries __initdata; ++ ++static int __init set_mptcp_htable_entries(char *str) ++{ ++ ssize_t ret; ++ ++ if (!str) ++ return 0; ++ ++ ret = kstrtoul(str, 0, &mptcp_htable_entries); ++ if (ret) ++ return 0; ++ ++ return 1; ++} ++__setup("mptcp_htable_entries=", set_mptcp_htable_entries); ++ ++/* General initialization of mptcp */ ++void __init mptcp_init(void) ++{ ++ unsigned int i; ++ struct ctl_table_header *mptcp_sysctl; ++ ++ mptcp_sock_cache = kmem_cache_create("mptcp_sock", ++ sizeof(struct mptcp_tcp_sock), ++ 0, SLAB_HWCACHE_ALIGN, ++ NULL); ++ if (!mptcp_sock_cache) ++ goto mptcp_sock_cache_failed; ++ ++ mptcp_cb_cache = kmem_cache_create("mptcp_cb", sizeof(struct mptcp_cb), ++ 0, SLAB_TYPESAFE_BY_RCU|SLAB_HWCACHE_ALIGN, ++ NULL); ++ if (!mptcp_cb_cache) ++ goto mptcp_cb_cache_failed; ++ ++ mptcp_tw_cache = kmem_cache_create("mptcp_tw", sizeof(struct mptcp_tw), ++ 0, SLAB_TYPESAFE_BY_RCU|SLAB_HWCACHE_ALIGN, ++ NULL); ++ if (!mptcp_tw_cache) ++ goto mptcp_tw_cache_failed; ++ ++ get_random_bytes(&mptcp_secret, sizeof(mptcp_secret)); ++ ++ mptcp_wq = alloc_workqueue("mptcp_wq", WQ_UNBOUND | WQ_MEM_RECLAIM, 8); ++ if (!mptcp_wq) ++ goto alloc_workqueue_failed; ++ ++ mptcp_tk_htable.hashtable = ++ alloc_large_system_hash("MPTCP tokens", ++ sizeof(mptcp_tk_htable.hashtable[0]), ++ mptcp_htable_entries, ++ 18, /* one slot per 256KB of memory */ ++ 0, ++ NULL, ++ &mptcp_tk_htable.mask, ++ 1024, ++ mptcp_htable_entries ? 0 : 1024 * 1024); ++ ++ for (i = 0; i <= mptcp_tk_htable.mask; i++) ++ INIT_HLIST_NULLS_HEAD(&mptcp_tk_htable.hashtable[i], i); ++ ++ mptcp_reqsk_tk_htb.hashtable = ++ alloc_large_system_hash("MPTCP request tokens", ++ sizeof(mptcp_reqsk_tk_htb.hashtable[0]), ++ mptcp_htable_entries, ++ 18, /* one slot per 256KB of memory */ ++ 0, ++ NULL, ++ &mptcp_reqsk_tk_htb.mask, ++ 1024, ++ mptcp_htable_entries ? 
0 : 1024 * 1024); ++ ++ for (i = 0; i <= mptcp_reqsk_tk_htb.mask; i++) ++ INIT_HLIST_NULLS_HEAD(&mptcp_reqsk_tk_htb.hashtable[i], i); ++ ++ ++ spin_lock_init(&mptcp_tk_hashlock); ++ ++ if (register_pernet_subsys(&mptcp_pm_proc_ops)) ++ goto pernet_failed; ++ ++#if IS_ENABLED(CONFIG_IPV6) ++ if (mptcp_pm_v6_init()) ++ goto mptcp_pm_v6_failed; ++#endif ++ if (mptcp_pm_v4_init()) ++ goto mptcp_pm_v4_failed; ++ ++ mptcp_sysctl = register_net_sysctl(&init_net, "net/mptcp", mptcp_table); ++ if (!mptcp_sysctl) ++ goto register_sysctl_failed; ++ ++ if (mptcp_register_path_manager(&mptcp_pm_default)) ++ goto register_pm_failed; ++ ++ if (mptcp_register_scheduler(&mptcp_sched_default)) ++ goto register_sched_failed; ++ ++ pr_info("MPTCP: Unstable branch"); ++ ++ mptcp_init_failed = false; ++ ++ return; ++ ++register_sched_failed: ++ mptcp_unregister_path_manager(&mptcp_pm_default); ++register_pm_failed: ++ unregister_net_sysctl_table(mptcp_sysctl); ++register_sysctl_failed: ++ mptcp_pm_v4_undo(); ++mptcp_pm_v4_failed: ++#if IS_ENABLED(CONFIG_IPV6) ++ mptcp_pm_v6_undo(); ++mptcp_pm_v6_failed: ++#endif ++ unregister_pernet_subsys(&mptcp_pm_proc_ops); ++pernet_failed: ++ destroy_workqueue(mptcp_wq); ++alloc_workqueue_failed: ++ kmem_cache_destroy(mptcp_tw_cache); ++mptcp_tw_cache_failed: ++ kmem_cache_destroy(mptcp_cb_cache); ++mptcp_cb_cache_failed: ++ kmem_cache_destroy(mptcp_sock_cache); ++mptcp_sock_cache_failed: ++ mptcp_init_failed = true; ++} +diff --git a/net/mptcp/mptcp_ecf.c b/net/mptcp/mptcp_ecf.c +new file mode 100644 +index 000000000000..6b976b2b0c72 +--- /dev/null ++++ b/net/mptcp/mptcp_ecf.c +@@ -0,0 +1,195 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* MPTCP ECF Scheduler ++ * ++ * Algorithm Design: ++ * Yeon-sup Lim ++ * Don Towsley ++ * Erich M. Nahum ++ * Richard J. Gibbens ++ * ++ * Initial Implementation: ++ * Yeon-sup Lim ++ * ++ * Additional Authors: ++ * Daniel Weber ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License ++ * as published by the Free Software Foundation; either version ++ * 2 of the License, or (at your option) any later version. ++ */ ++ ++#include ++#include ++ ++static unsigned int mptcp_ecf_r_beta __read_mostly = 4; /* beta = 1/r_beta = 0.25 */ ++module_param(mptcp_ecf_r_beta, int, 0644); ++MODULE_PARM_DESC(mptcp_ecf_r_beta, "beta for ECF"); ++ ++struct ecfsched_priv { ++ u32 last_rbuf_opti; ++}; ++ ++struct ecfsched_cb { ++ u32 switching_margin; /* this is "waiting" in algorithm description */ ++}; ++ ++static struct ecfsched_priv *ecfsched_get_priv(const struct tcp_sock *tp) ++{ ++ return (struct ecfsched_priv *)&tp->mptcp->mptcp_sched[0]; ++} ++ ++static struct ecfsched_cb *ecfsched_get_cb(const struct tcp_sock *tp) ++{ ++ return (struct ecfsched_cb *)&tp->mpcb->mptcp_sched[0]; ++} ++ ++/* This is the ECF scheduler. This function decides on which flow to send ++ * a given MSS. If all subflows are found to be busy or the currently best ++ * subflow is estimated to be slower than waiting for minsk, NULL is returned. ++ */ ++static struct sock *ecf_get_available_subflow(struct sock *meta_sk, ++ struct sk_buff *skb, ++ bool zero_wnd_test) ++{ ++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; ++ struct sock *bestsk, *minsk = NULL; ++ struct tcp_sock *besttp; ++ struct mptcp_tcp_sock *mptcp; ++ struct ecfsched_cb *ecf_cb = ecfsched_get_cb(tcp_sk(meta_sk)); ++ u32 min_srtt = U32_MAX; ++ u32 sub_sndbuf = 0; ++ u32 sub_packets_out = 0; ++ ++ /* Answer data_fin on same subflow!!! 
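++	 * The reply to a DATA_FIN has to go out on the subflow it was received on
++	 * (dfin_path_index), as long as that subflow is still available.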
*/ ++ if (meta_sk->sk_shutdown & RCV_SHUTDOWN && ++ skb && mptcp_is_data_fin(skb)) { ++ mptcp_for_each_sub(mpcb, mptcp) { ++ bestsk = mptcp_to_sock(mptcp); ++ ++ if (tcp_sk(bestsk)->mptcp->path_index == mpcb->dfin_path_index && ++ mptcp_is_available(bestsk, skb, zero_wnd_test)) ++ return bestsk; ++ } ++ } ++ ++ /* First, find the overall best (fastest) subflow */ ++ mptcp_for_each_sub(mpcb, mptcp) { ++ bestsk = mptcp_to_sock(mptcp); ++ besttp = tcp_sk(bestsk); ++ ++ /* Set of states for which we are allowed to send data */ ++ if (!mptcp_sk_can_send(bestsk)) ++ continue; ++ ++ /* We do not send data on this subflow unless it is ++ * fully established, i.e. the 4th ack has been received. ++ */ ++ if (besttp->mptcp->pre_established) ++ continue; ++ ++ sub_sndbuf += bestsk->sk_wmem_queued; ++ sub_packets_out += besttp->packets_out; ++ ++ /* record minimal rtt */ ++ if (besttp->srtt_us < min_srtt) { ++ min_srtt = besttp->srtt_us; ++ minsk = bestsk; ++ } ++ } ++ ++ /* find the current best subflow according to the default scheduler */ ++ bestsk = get_available_subflow(meta_sk, skb, zero_wnd_test); ++ ++ /* if we decided to use a slower flow, we have the option of not using it at all */ ++ if (bestsk && minsk && bestsk != minsk) { ++ u32 mss = tcp_current_mss(bestsk); /* assuming equal MSS */ ++ u32 sndbuf_meta = meta_sk->sk_wmem_queued; ++ u32 sndbuf_minus = sub_sndbuf; ++ u32 sndbuf = 0; ++ ++ u32 cwnd_f = tcp_sk(minsk)->snd_cwnd; ++ u32 srtt_f = tcp_sk(minsk)->srtt_us >> 3; ++ u32 rttvar_f = tcp_sk(minsk)->rttvar_us >> 1; ++ ++ u32 cwnd_s = tcp_sk(bestsk)->snd_cwnd; ++ u32 srtt_s = tcp_sk(bestsk)->srtt_us >> 3; ++ u32 rttvar_s = tcp_sk(bestsk)->rttvar_us >> 1; ++ ++ u32 delta = max(rttvar_f, rttvar_s); ++ ++ u32 x_f; ++ u64 lhs, rhs; /* to avoid overflow, using u64 */ ++ ++ if (tcp_sk(meta_sk)->packets_out > sub_packets_out) ++ sndbuf_minus += (tcp_sk(meta_sk)->packets_out - sub_packets_out) * mss; ++ ++ if (sndbuf_meta > sndbuf_minus) ++ sndbuf = sndbuf_meta - sndbuf_minus; ++ ++ /* we have something to send. ++ * at least one time tx over fastest subflow is required ++ */ ++ x_f = sndbuf > cwnd_f * mss ? sndbuf : cwnd_f * mss; ++ lhs = srtt_f * (x_f + cwnd_f * mss); ++ rhs = cwnd_f * mss * (srtt_s + delta); ++ ++ if (mptcp_ecf_r_beta * lhs < mptcp_ecf_r_beta * rhs + ecf_cb->switching_margin * rhs) { ++ u32 x_s = sndbuf > cwnd_s * mss ? 
sndbuf : cwnd_s * mss; ++ u64 lhs_s = srtt_s * x_s; ++ u64 rhs_s = cwnd_s * mss * (2 * srtt_f + delta); ++ ++ if (lhs_s >= rhs_s) { ++ /* too slower than fastest */ ++ ecf_cb->switching_margin = 1; ++ return NULL; ++ } ++ } else { ++ /* use slower one */ ++ ecf_cb->switching_margin = 0; ++ } ++ } ++ ++ return bestsk; ++} ++ ++static void ecfsched_init(struct sock *sk) ++{ ++ struct ecfsched_priv *ecf_p = ecfsched_get_priv(tcp_sk(sk)); ++ struct ecfsched_cb *ecf_cb = ecfsched_get_cb(tcp_sk(mptcp_meta_sk(sk))); ++ ++ ecf_p->last_rbuf_opti = tcp_jiffies32; ++ ecf_cb->switching_margin = 0; ++} ++ ++struct mptcp_sched_ops mptcp_sched_ecf = { ++ .get_subflow = ecf_get_available_subflow, ++ .next_segment = mptcp_next_segment, ++ .init = ecfsched_init, ++ .name = "ecf", ++ .owner = THIS_MODULE, ++}; ++ ++static int __init ecf_register(void) ++{ ++ BUILD_BUG_ON(sizeof(struct ecfsched_priv) > MPTCP_SCHED_SIZE); ++ BUILD_BUG_ON(sizeof(struct ecfsched_cb) > MPTCP_SCHED_DATA_SIZE); ++ ++ if (mptcp_register_scheduler(&mptcp_sched_ecf)) ++ return -1; ++ ++ return 0; ++} ++ ++static void ecf_unregister(void) ++{ ++ mptcp_unregister_scheduler(&mptcp_sched_ecf); ++} ++ ++module_init(ecf_register); ++module_exit(ecf_unregister); ++ ++MODULE_AUTHOR("Yeon-sup Lim, Daniel Weber"); ++MODULE_LICENSE("GPL"); ++MODULE_DESCRIPTION("ECF (Earliest Completion First) scheduler for MPTCP, based on default minimum RTT scheduler"); ++MODULE_VERSION("0.95"); +diff --git a/net/mptcp/mptcp_fullmesh.c b/net/mptcp/mptcp_fullmesh.c +new file mode 100644 +index 000000000000..5424960256e6 +--- /dev/null ++++ b/net/mptcp/mptcp_fullmesh.c +@@ -0,0 +1,1938 @@ ++#include ++#include ++ ++#include ++#include ++ ++#if IS_ENABLED(CONFIG_IPV6) ++#include ++#include ++#endif ++ ++enum { ++ MPTCP_EVENT_ADD = 1, ++ MPTCP_EVENT_DEL, ++ MPTCP_EVENT_MOD, ++}; ++ ++#define MPTCP_SUBFLOW_RETRY_DELAY 1000 ++ ++/* Max number of local or remote addresses we can store. ++ * When changing, see the bitfield below in fullmesh_rem4/6. ++ */ ++#define MPTCP_MAX_ADDR 8 ++ ++struct fullmesh_rem4 { ++ u8 rem4_id; ++ u8 bitfield; ++ u8 retry_bitfield; ++ __be16 port; ++ struct in_addr addr; ++}; ++ ++struct fullmesh_rem6 { ++ u8 rem6_id; ++ u8 bitfield; ++ u8 retry_bitfield; ++ __be16 port; ++ struct in6_addr addr; ++}; ++ ++struct mptcp_loc_addr { ++ struct mptcp_loc4 locaddr4[MPTCP_MAX_ADDR]; ++ u8 loc4_bits; ++ u8 next_v4_index; ++ ++ struct mptcp_loc6 locaddr6[MPTCP_MAX_ADDR]; ++ u8 loc6_bits; ++ u8 next_v6_index; ++ struct rcu_head rcu; ++}; ++ ++struct mptcp_addr_event { ++ struct list_head list; ++ unsigned short family; ++ u8 code:7, ++ low_prio:1; ++ int if_idx; ++ union inet_addr addr; ++}; ++ ++struct fullmesh_priv { ++ /* Worker struct for subflow establishment */ ++ struct work_struct subflow_work; ++ /* Delayed worker, when the routing-tables are not yet ready. */ ++ struct delayed_work subflow_retry_work; ++ ++ /* Remote addresses */ ++ struct fullmesh_rem4 remaddr4[MPTCP_MAX_ADDR]; ++ struct fullmesh_rem6 remaddr6[MPTCP_MAX_ADDR]; ++ ++ struct mptcp_cb *mpcb; ++ ++ u16 remove_addrs; /* Addresses to remove */ ++ u8 announced_addrs_v4; /* IPv4 Addresses we did announce */ ++ u8 announced_addrs_v6; /* IPv6 Addresses we did announce */ ++ ++ u8 add_addr; /* Are we sending an add_addr? */ ++ ++ u8 rem4_bits; ++ u8 rem6_bits; ++ ++ /* Have we established the additional subflows for primary pair? 
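++	 * Set once the additional subflows (see the num_subflows module
++	 * parameter) over the initial address pair have been created.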
*/ ++ u8 first_pair:1; ++}; ++ ++struct mptcp_fm_ns { ++ struct mptcp_loc_addr __rcu *local; ++ spinlock_t local_lock; /* Protecting the above pointer */ ++ struct list_head events; ++ struct delayed_work address_worker; ++ ++ struct net *net; ++}; ++ ++static int num_subflows __read_mostly = 1; ++module_param(num_subflows, int, 0644); ++MODULE_PARM_DESC(num_subflows, "choose the number of subflows per pair of IP addresses of MPTCP connection"); ++ ++static int create_on_err __read_mostly; ++module_param(create_on_err, int, 0644); ++MODULE_PARM_DESC(create_on_err, "recreate the subflow upon a timeout"); ++ ++static struct mptcp_pm_ops full_mesh __read_mostly; ++ ++static void full_mesh_create_subflows(struct sock *meta_sk); ++ ++static struct mptcp_fm_ns *fm_get_ns(const struct net *net) ++{ ++ return (struct mptcp_fm_ns *)net->mptcp.path_managers[MPTCP_PM_FULLMESH]; ++} ++ ++static struct fullmesh_priv *fullmesh_get_priv(const struct mptcp_cb *mpcb) ++{ ++ return (struct fullmesh_priv *)&mpcb->mptcp_pm[0]; ++} ++ ++/* Find the first free index in the bitfield */ ++static int __mptcp_find_free_index(u8 bitfield, u8 base) ++{ ++ int i; ++ ++ /* There are anyways no free bits... */ ++ if (bitfield == 0xff) ++ goto exit; ++ ++ i = ffs(~(bitfield >> base)) - 1; ++ if (i < 0) ++ goto exit; ++ ++ /* No free bits when starting at base, try from 0 on */ ++ if (i + base >= sizeof(bitfield) * 8) ++ return __mptcp_find_free_index(bitfield, 0); ++ ++ return i + base; ++exit: ++ return -1; ++} ++ ++static int mptcp_find_free_index(u8 bitfield) ++{ ++ return __mptcp_find_free_index(bitfield, 0); ++} ++ ++static void mptcp_addv4_raddr(struct mptcp_cb *mpcb, ++ const struct in_addr *addr, ++ __be16 port, u8 id) ++{ ++ int i; ++ struct fullmesh_rem4 *rem4; ++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb); ++ ++ mptcp_for_each_bit_set(fmp->rem4_bits, i) { ++ rem4 = &fmp->remaddr4[i]; ++ ++ /* Address is already in the list --- continue */ ++ if (rem4->rem4_id == id && ++ rem4->addr.s_addr == addr->s_addr && rem4->port == port) ++ return; ++ ++ /* This may be the case, when the peer is behind a NAT. He is ++ * trying to JOIN, thus sending the JOIN with a certain ID. ++ * However the src_addr of the IP-packet has been changed. We ++ * update the addr in the list, because this is the address as ++ * OUR BOX sees it. ++ */ ++ if (rem4->rem4_id == id && rem4->addr.s_addr != addr->s_addr) { ++ /* update the address */ ++ mptcp_debug("%s: updating old addr:%pI4 to addr %pI4 with id:%d\n", ++ __func__, &rem4->addr.s_addr, ++ &addr->s_addr, id); ++ rem4->addr.s_addr = addr->s_addr; ++ rem4->port = port; ++ mpcb->list_rcvd = 1; ++ return; ++ } ++ } ++ ++ i = mptcp_find_free_index(fmp->rem4_bits); ++ /* Do we have already the maximum number of local/remote addresses? 
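++	 * mptcp_find_free_index() returns -1 once all MPTCP_MAX_ADDR slots are in use.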
*/ ++ if (i < 0) { ++ mptcp_debug("%s: At max num of remote addresses: %d --- not adding address: %pI4\n", ++ __func__, MPTCP_MAX_ADDR, &addr->s_addr); ++ return; ++ } ++ ++ rem4 = &fmp->remaddr4[i]; ++ ++ /* Address is not known yet, store it */ ++ rem4->addr.s_addr = addr->s_addr; ++ rem4->port = port; ++ rem4->bitfield = 0; ++ rem4->retry_bitfield = 0; ++ rem4->rem4_id = id; ++ mpcb->list_rcvd = 1; ++ fmp->rem4_bits |= (1 << i); ++ ++ return; ++} ++ ++static void mptcp_addv6_raddr(struct mptcp_cb *mpcb, ++ const struct in6_addr *addr, ++ __be16 port, u8 id) ++{ ++ int i; ++ struct fullmesh_rem6 *rem6; ++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb); ++ ++ mptcp_for_each_bit_set(fmp->rem6_bits, i) { ++ rem6 = &fmp->remaddr6[i]; ++ ++ /* Address is already in the list --- continue */ ++ if (rem6->rem6_id == id && ++ ipv6_addr_equal(&rem6->addr, addr) && rem6->port == port) ++ return; ++ ++ /* This may be the case, when the peer is behind a NAT. He is ++ * trying to JOIN, thus sending the JOIN with a certain ID. ++ * However the src_addr of the IP-packet has been changed. We ++ * update the addr in the list, because this is the address as ++ * OUR BOX sees it. ++ */ ++ if (rem6->rem6_id == id) { ++ /* update the address */ ++ mptcp_debug("%s: updating old addr: %pI6 to addr %pI6 with id:%d\n", ++ __func__, &rem6->addr, addr, id); ++ rem6->addr = *addr; ++ rem6->port = port; ++ mpcb->list_rcvd = 1; ++ return; ++ } ++ } ++ ++ i = mptcp_find_free_index(fmp->rem6_bits); ++ /* Do we have already the maximum number of local/remote addresses? */ ++ if (i < 0) { ++ mptcp_debug("%s: At max num of remote addresses: %d --- not adding address: %pI6\n", ++ __func__, MPTCP_MAX_ADDR, addr); ++ return; ++ } ++ ++ rem6 = &fmp->remaddr6[i]; ++ ++ /* Address is not known yet, store it */ ++ rem6->addr = *addr; ++ rem6->port = port; ++ rem6->bitfield = 0; ++ rem6->retry_bitfield = 0; ++ rem6->rem6_id = id; ++ mpcb->list_rcvd = 1; ++ fmp->rem6_bits |= (1 << i); ++ ++ return; ++} ++ ++static void mptcp_v4_rem_raddress(struct mptcp_cb *mpcb, u8 id) ++{ ++ int i; ++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb); ++ ++ mptcp_for_each_bit_set(fmp->rem4_bits, i) { ++ if (fmp->remaddr4[i].rem4_id == id) { ++ /* remove address from bitfield */ ++ fmp->rem4_bits &= ~(1 << i); ++ ++ break; ++ } ++ } ++} ++ ++static void mptcp_v6_rem_raddress(const struct mptcp_cb *mpcb, u8 id) ++{ ++ int i; ++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb); ++ ++ mptcp_for_each_bit_set(fmp->rem6_bits, i) { ++ if (fmp->remaddr6[i].rem6_id == id) { ++ /* remove address from bitfield */ ++ fmp->rem6_bits &= ~(1 << i); ++ ++ break; ++ } ++ } ++} ++ ++/* Sets the bitfield of the remote-address field */ ++static void mptcp_v4_set_init_addr_bit(const struct mptcp_cb *mpcb, ++ const struct in_addr *addr, u8 index) ++{ ++ int i; ++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb); ++ ++ mptcp_for_each_bit_set(fmp->rem4_bits, i) { ++ if (fmp->remaddr4[i].addr.s_addr == addr->s_addr) { ++ fmp->remaddr4[i].bitfield |= (1 << index); ++ return; ++ } ++ } ++} ++ ++/* Sets the bitfield of the remote-address field */ ++static void mptcp_v6_set_init_addr_bit(struct mptcp_cb *mpcb, ++ const struct in6_addr *addr, u8 index) ++{ ++ int i; ++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb); ++ ++ mptcp_for_each_bit_set(fmp->rem6_bits, i) { ++ if (ipv6_addr_equal(&fmp->remaddr6[i].addr, addr)) { ++ fmp->remaddr6[i].bitfield |= (1 << index); ++ return; ++ } ++ } ++} ++ ++static void mptcp_set_init_addr_bit(struct mptcp_cb *mpcb, ++ 
const union inet_addr *addr, ++ sa_family_t family, u8 id) ++{ ++ if (family == AF_INET) ++ mptcp_v4_set_init_addr_bit(mpcb, &addr->in, id); ++ else ++ mptcp_v6_set_init_addr_bit(mpcb, &addr->in6, id); ++} ++ ++static void mptcp_v4_subflows(struct sock *meta_sk, ++ const struct mptcp_loc4 *loc, ++ struct mptcp_rem4 *rem) ++{ ++ int i; ++ ++ for (i = 1; i < num_subflows; i++) ++ mptcp_init4_subsockets(meta_sk, loc, rem); ++} ++ ++#if IS_ENABLED(CONFIG_IPV6) ++static void mptcp_v6_subflows(struct sock *meta_sk, ++ const struct mptcp_loc6 *loc, ++ struct mptcp_rem6 *rem) ++{ ++ int i; ++ ++ for (i = 1; i < num_subflows; i++) ++ mptcp_init6_subsockets(meta_sk, loc, rem); ++} ++#endif ++ ++static void retry_subflow_worker(struct work_struct *work) ++{ ++ struct delayed_work *delayed_work = container_of(work, ++ struct delayed_work, ++ work); ++ struct fullmesh_priv *fmp = container_of(delayed_work, ++ struct fullmesh_priv, ++ subflow_retry_work); ++ struct mptcp_cb *mpcb = fmp->mpcb; ++ struct sock *meta_sk = mpcb->meta_sk; ++ struct mptcp_loc_addr *mptcp_local; ++ struct mptcp_fm_ns *fm_ns = fm_get_ns(sock_net(meta_sk)); ++ int iter = 0, i; ++ ++ /* We need a local (stable) copy of the address-list. Really, it is not ++ * such a big deal, if the address-list is not 100% up-to-date. ++ */ ++ rcu_read_lock_bh(); ++ mptcp_local = rcu_dereference_bh(fm_ns->local); ++ mptcp_local = kmemdup(mptcp_local, sizeof(*mptcp_local), GFP_ATOMIC); ++ rcu_read_unlock_bh(); ++ ++ if (!mptcp_local) ++ return; ++ ++next_subflow: ++ if (iter) { ++ release_sock(meta_sk); ++ mutex_unlock(&mpcb->mpcb_mutex); ++ ++ cond_resched(); ++ } ++ mutex_lock(&mpcb->mpcb_mutex); ++ lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING); ++ ++ if (!mptcp(tcp_sk(meta_sk))) ++ goto exit; ++ ++ iter++; ++ ++ if (sock_flag(meta_sk, SOCK_DEAD)) ++ goto exit; ++ ++ mptcp_for_each_bit_set(fmp->rem4_bits, i) { ++ struct fullmesh_rem4 *rem = &fmp->remaddr4[i]; ++ /* Do we need to retry establishing a subflow ? */ ++ if (rem->retry_bitfield) { ++ int i = mptcp_find_free_index(~rem->retry_bitfield); ++ struct mptcp_rem4 rem4; ++ ++ rem->bitfield |= (1 << i); ++ rem->retry_bitfield &= ~(1 << i); ++ ++ rem4.addr = rem->addr; ++ rem4.port = rem->port; ++ rem4.rem4_id = rem->rem4_id; ++ ++ mptcp_init4_subsockets(meta_sk, &mptcp_local->locaddr4[i], &rem4); ++ mptcp_v4_subflows(meta_sk, ++ &mptcp_local->locaddr4[i], ++ &rem4); ++ goto next_subflow; ++ } ++ } ++ ++#if IS_ENABLED(CONFIG_IPV6) ++ mptcp_for_each_bit_set(fmp->rem6_bits, i) { ++ struct fullmesh_rem6 *rem = &fmp->remaddr6[i]; ++ ++ /* Do we need to retry establishing a subflow ? */ ++ if (rem->retry_bitfield) { ++ int i = mptcp_find_free_index(~rem->retry_bitfield); ++ struct mptcp_rem6 rem6; ++ ++ rem->bitfield |= (1 << i); ++ rem->retry_bitfield &= ~(1 << i); ++ ++ rem6.addr = rem->addr; ++ rem6.port = rem->port; ++ rem6.rem6_id = rem->rem6_id; ++ ++ mptcp_init6_subsockets(meta_sk, &mptcp_local->locaddr6[i], &rem6); ++ mptcp_v6_subflows(meta_sk, ++ &mptcp_local->locaddr6[i], ++ &rem6); ++ goto next_subflow; ++ } ++ } ++#endif ++ ++exit: ++ kfree(mptcp_local); ++ release_sock(meta_sk); ++ mutex_unlock(&mpcb->mpcb_mutex); ++ mptcp_mpcb_put(mpcb); ++ sock_put(meta_sk); ++} ++ ++/** ++ * Create all new subflows, by doing calls to mptcp_initX_subsockets ++ * ++ * This function uses a goto next_subflow, to allow releasing the lock between ++ * new subflows and giving other processes a chance to do some work on the ++ * socket and potentially finishing the communication. 
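++ * Each pass through the loop establishes at most one new subflow before
++ * jumping back to next_subflow, where the locks are dropped and re-taken.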
++ **/ ++static void create_subflow_worker(struct work_struct *work) ++{ ++ struct fullmesh_priv *fmp = container_of(work, struct fullmesh_priv, ++ subflow_work); ++ struct mptcp_cb *mpcb = fmp->mpcb; ++ struct sock *meta_sk = mpcb->meta_sk; ++ struct mptcp_loc_addr *mptcp_local; ++ const struct mptcp_fm_ns *fm_ns = fm_get_ns(sock_net(meta_sk)); ++ int iter = 0, retry = 0; ++ int i; ++ ++ /* We need a local (stable) copy of the address-list. Really, it is not ++ * such a big deal, if the address-list is not 100% up-to-date. ++ */ ++ rcu_read_lock_bh(); ++ mptcp_local = rcu_dereference_bh(fm_ns->local); ++ mptcp_local = kmemdup(mptcp_local, sizeof(*mptcp_local), GFP_ATOMIC); ++ rcu_read_unlock_bh(); ++ ++ if (!mptcp_local) ++ return; ++ ++next_subflow: ++ if (iter) { ++ release_sock(meta_sk); ++ mutex_unlock(&mpcb->mpcb_mutex); ++ ++ cond_resched(); ++ } ++ mutex_lock(&mpcb->mpcb_mutex); ++ lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING); ++ ++ if (sock_flag(meta_sk, SOCK_DEAD) || !mptcp(tcp_sk(meta_sk))) ++ goto exit; ++ ++ if (mpcb->master_sk && ++ !tcp_sk(mpcb->master_sk)->mptcp->fully_established) ++ goto exit; ++ ++ /* Create the additional subflows for the first pair */ ++ if (fmp->first_pair == 0 && mpcb->master_sk) { ++ struct mptcp_loc4 loc; ++ struct mptcp_rem4 rem; ++ ++ loc.addr.s_addr = inet_sk(meta_sk)->inet_saddr; ++ loc.loc4_id = 0; ++ loc.low_prio = 0; ++ loc.if_idx = mpcb->master_sk->sk_bound_dev_if; ++ ++ rem.addr.s_addr = inet_sk(meta_sk)->inet_daddr; ++ rem.port = inet_sk(meta_sk)->inet_dport; ++ rem.rem4_id = 0; /* Default 0 */ ++ ++ mptcp_v4_subflows(meta_sk, &loc, &rem); ++ ++ fmp->first_pair = 1; ++ } ++ iter++; ++ ++ mptcp_for_each_bit_set(fmp->rem4_bits, i) { ++ struct fullmesh_rem4 *rem; ++ u8 remaining_bits; ++ ++ rem = &fmp->remaddr4[i]; ++ remaining_bits = ~(rem->bitfield) & mptcp_local->loc4_bits; ++ ++ /* Are there still combinations to handle? */ ++ if (remaining_bits) { ++ int i = mptcp_find_free_index(~remaining_bits); ++ struct mptcp_rem4 rem4; ++ ++ rem->bitfield |= (1 << i); ++ ++ rem4.addr = rem->addr; ++ rem4.port = rem->port; ++ rem4.rem4_id = rem->rem4_id; ++ ++ /* If a route is not yet available then retry once */ ++ if (mptcp_init4_subsockets(meta_sk, &mptcp_local->locaddr4[i], ++ &rem4) == -ENETUNREACH) ++ retry = rem->retry_bitfield |= (1 << i); ++ else ++ mptcp_v4_subflows(meta_sk, ++ &mptcp_local->locaddr4[i], ++ &rem4); ++ goto next_subflow; ++ } ++ } ++ ++#if IS_ENABLED(CONFIG_IPV6) ++ if (fmp->first_pair == 0 && mpcb->master_sk) { ++ struct mptcp_loc6 loc; ++ struct mptcp_rem6 rem; ++ ++ loc.addr = inet6_sk(meta_sk)->saddr; ++ loc.loc6_id = 0; ++ loc.low_prio = 0; ++ loc.if_idx = mpcb->master_sk->sk_bound_dev_if; ++ ++ rem.addr = meta_sk->sk_v6_daddr; ++ rem.port = inet_sk(meta_sk)->inet_dport; ++ rem.rem6_id = 0; /* Default 0 */ ++ ++ mptcp_v6_subflows(meta_sk, &loc, &rem); ++ ++ fmp->first_pair = 1; ++ } ++ mptcp_for_each_bit_set(fmp->rem6_bits, i) { ++ struct fullmesh_rem6 *rem; ++ u8 remaining_bits; ++ ++ rem = &fmp->remaddr6[i]; ++ remaining_bits = ~(rem->bitfield) & mptcp_local->loc6_bits; ++ ++ /* Are there still combinations to handle? 
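++		 * remaining_bits marks the local addresses that have not yet been
++		 * combined with this remote address.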
*/ ++ if (remaining_bits) { ++ int i = mptcp_find_free_index(~remaining_bits); ++ struct mptcp_rem6 rem6; ++ ++ rem->bitfield |= (1 << i); ++ ++ rem6.addr = rem->addr; ++ rem6.port = rem->port; ++ rem6.rem6_id = rem->rem6_id; ++ ++ /* If a route is not yet available then retry once */ ++ if (mptcp_init6_subsockets(meta_sk, &mptcp_local->locaddr6[i], ++ &rem6) == -ENETUNREACH) ++ retry = rem->retry_bitfield |= (1 << i); ++ else ++ mptcp_v6_subflows(meta_sk, ++ &mptcp_local->locaddr6[i], ++ &rem6); ++ goto next_subflow; ++ } ++ } ++#endif ++ ++ if (retry && !delayed_work_pending(&fmp->subflow_retry_work)) { ++ sock_hold(meta_sk); ++ refcount_inc(&mpcb->mpcb_refcnt); ++ queue_delayed_work(mptcp_wq, &fmp->subflow_retry_work, ++ msecs_to_jiffies(MPTCP_SUBFLOW_RETRY_DELAY)); ++ } ++ ++exit: ++ kfree(mptcp_local); ++ release_sock(meta_sk); ++ mutex_unlock(&mpcb->mpcb_mutex); ++ mptcp_mpcb_put(mpcb); ++ sock_put(meta_sk); ++} ++ ++static void announce_remove_addr(u8 addr_id, struct sock *meta_sk) ++{ ++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; ++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb); ++ struct sock *sk = mptcp_select_ack_sock(meta_sk); ++ ++ fmp->remove_addrs |= (1 << addr_id); ++ mpcb->addr_signal = 1; ++ ++ if (sk) ++ tcp_send_ack(sk); ++} ++ ++static void update_addr_bitfields(struct sock *meta_sk, ++ const struct mptcp_loc_addr *mptcp_local) ++{ ++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; ++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb); ++ int i; ++ ++ /* The bits in announced_addrs_* always match with loc*_bits. So, a ++ * simple & operation unsets the correct bits, because these go from ++ * announced to non-announced ++ */ ++ fmp->announced_addrs_v4 &= mptcp_local->loc4_bits; ++ ++ mptcp_for_each_bit_set(fmp->rem4_bits, i) { ++ fmp->remaddr4[i].bitfield &= mptcp_local->loc4_bits; ++ fmp->remaddr4[i].retry_bitfield &= mptcp_local->loc4_bits; ++ } ++ ++ fmp->announced_addrs_v6 &= mptcp_local->loc6_bits; ++ ++ mptcp_for_each_bit_set(fmp->rem6_bits, i) { ++ fmp->remaddr6[i].bitfield &= mptcp_local->loc6_bits; ++ fmp->remaddr6[i].retry_bitfield &= mptcp_local->loc6_bits; ++ } ++} ++ ++static int mptcp_find_address(const struct mptcp_loc_addr *mptcp_local, ++ sa_family_t family, const union inet_addr *addr, ++ int if_idx) ++{ ++ int i; ++ u8 loc_bits; ++ bool found = false; ++ ++ if (family == AF_INET) ++ loc_bits = mptcp_local->loc4_bits; ++ else ++ loc_bits = mptcp_local->loc6_bits; ++ ++ mptcp_for_each_bit_set(loc_bits, i) { ++ if (family == AF_INET && ++ (!if_idx || mptcp_local->locaddr4[i].if_idx == if_idx) && ++ mptcp_local->locaddr4[i].addr.s_addr == addr->in.s_addr) { ++ found = true; ++ break; ++ } ++ if (family == AF_INET6 && ++ (!if_idx || mptcp_local->locaddr6[i].if_idx == if_idx) && ++ ipv6_addr_equal(&mptcp_local->locaddr6[i].addr, ++ &addr->in6)) { ++ found = true; ++ break; ++ } ++ } ++ ++ if (!found) ++ return -1; ++ ++ return i; ++} ++ ++static int mptcp_find_address_transp(const struct mptcp_loc_addr *mptcp_local, ++ sa_family_t family, int if_idx) ++{ ++ bool found = false; ++ u8 loc_bits; ++ int i; ++ ++ if (family == AF_INET) ++ loc_bits = mptcp_local->loc4_bits; ++ else ++ loc_bits = mptcp_local->loc6_bits; ++ ++ mptcp_for_each_bit_set(loc_bits, i) { ++ if (family == AF_INET && ++ (!if_idx || mptcp_local->locaddr4[i].if_idx == if_idx)) { ++ found = true; ++ break; ++ } ++ if (family == AF_INET6 && ++ (!if_idx || mptcp_local->locaddr6[i].if_idx == if_idx)) { ++ found = true; ++ break; ++ } ++ } ++ ++ if (!found) ++ return -1; ++ ++ return i; 
++} ++ ++static void mptcp_address_worker(struct work_struct *work) ++{ ++ const struct delayed_work *delayed_work = container_of(work, ++ struct delayed_work, ++ work); ++ struct mptcp_fm_ns *fm_ns = container_of(delayed_work, ++ struct mptcp_fm_ns, ++ address_worker); ++ struct net *net = fm_ns->net; ++ struct mptcp_addr_event *event = NULL; ++ struct mptcp_loc_addr *mptcp_local, *old; ++ int i, id = -1; /* id is used in the socket-code on a delete-event */ ++ bool success; /* Used to indicate if we succeeded handling the event */ ++ ++next_event: ++ success = false; ++ kfree(event); ++ ++ /* First, let's dequeue an event from our event-list */ ++ rcu_read_lock_bh(); ++ spin_lock(&fm_ns->local_lock); ++ ++ event = list_first_entry_or_null(&fm_ns->events, ++ struct mptcp_addr_event, list); ++ if (!event) { ++ spin_unlock(&fm_ns->local_lock); ++ rcu_read_unlock_bh(); ++ return; ++ } ++ ++ list_del(&event->list); ++ ++ mptcp_local = rcu_dereference_bh(fm_ns->local); ++ ++ if (event->code == MPTCP_EVENT_DEL) { ++ id = mptcp_find_address(mptcp_local, event->family, ++ &event->addr, event->if_idx); ++ ++ /* Not in the list - so we don't care */ ++ if (id < 0) { ++ mptcp_debug("%s could not find id\n", __func__); ++ goto duno; ++ } ++ ++ old = mptcp_local; ++ mptcp_local = kmemdup(mptcp_local, sizeof(*mptcp_local), ++ GFP_ATOMIC); ++ if (!mptcp_local) ++ goto duno; ++ ++ if (event->family == AF_INET) ++ mptcp_local->loc4_bits &= ~(1 << id); ++ else ++ mptcp_local->loc6_bits &= ~(1 << id); ++ ++ rcu_assign_pointer(fm_ns->local, mptcp_local); ++ kfree_rcu(old, rcu); ++ } else { ++ int i = mptcp_find_address(mptcp_local, event->family, ++ &event->addr, event->if_idx); ++ int j = i; ++ ++ if (j < 0) { ++ /* Not in the list, so we have to find an empty slot */ ++ if (event->family == AF_INET) ++ i = __mptcp_find_free_index(mptcp_local->loc4_bits, ++ mptcp_local->next_v4_index); ++ if (event->family == AF_INET6) ++ i = __mptcp_find_free_index(mptcp_local->loc6_bits, ++ mptcp_local->next_v6_index); ++ ++ if (i < 0) { ++ mptcp_debug("%s no more space\n", __func__); ++ goto duno; ++ } ++ ++ /* It might have been a MOD-event. 
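++			 * The address was unknown so far, so treat the event as an
++			 * ADD from here on.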
*/ ++ event->code = MPTCP_EVENT_ADD; ++ } else { ++ /* Let's check if anything changes */ ++ if (event->family == AF_INET && ++ event->low_prio == mptcp_local->locaddr4[i].low_prio) ++ goto duno; ++ ++ if (event->family == AF_INET6 && ++ event->low_prio == mptcp_local->locaddr6[i].low_prio) ++ goto duno; ++ } ++ ++ old = mptcp_local; ++ mptcp_local = kmemdup(mptcp_local, sizeof(*mptcp_local), ++ GFP_ATOMIC); ++ if (!mptcp_local) ++ goto duno; ++ ++ if (event->family == AF_INET) { ++ mptcp_local->locaddr4[i].addr.s_addr = event->addr.in.s_addr; ++ mptcp_local->locaddr4[i].loc4_id = i + 1; ++ mptcp_local->locaddr4[i].low_prio = event->low_prio; ++ mptcp_local->locaddr4[i].if_idx = event->if_idx; ++ ++ mptcp_debug("%s updated IP %pI4 on ifidx %u prio %u id %u\n", ++ __func__, &event->addr.in.s_addr, ++ event->if_idx, event->low_prio, i + 1); ++ } else { ++ mptcp_local->locaddr6[i].addr = event->addr.in6; ++ mptcp_local->locaddr6[i].loc6_id = i + MPTCP_MAX_ADDR; ++ mptcp_local->locaddr6[i].low_prio = event->low_prio; ++ mptcp_local->locaddr6[i].if_idx = event->if_idx; ++ ++ mptcp_debug("%s updated IP %pI6 on ifidx %u prio %u id %u\n", ++ __func__, &event->addr.in6, ++ event->if_idx, event->low_prio, i + MPTCP_MAX_ADDR); ++ } ++ ++ if (j < 0) { ++ if (event->family == AF_INET) { ++ mptcp_local->loc4_bits |= (1 << i); ++ mptcp_local->next_v4_index = i + 1; ++ } else { ++ mptcp_local->loc6_bits |= (1 << i); ++ mptcp_local->next_v6_index = i + 1; ++ } ++ } ++ ++ rcu_assign_pointer(fm_ns->local, mptcp_local); ++ kfree_rcu(old, rcu); ++ } ++ success = true; ++ ++duno: ++ spin_unlock(&fm_ns->local_lock); ++ rcu_read_unlock_bh(); ++ ++ if (!success) ++ goto next_event; ++ ++ /* Now we iterate over the MPTCP-sockets and apply the event. */ ++ for (i = 0; i <= mptcp_tk_htable.mask; i++) { ++ const struct hlist_nulls_node *node; ++ struct tcp_sock *meta_tp; ++ ++ rcu_read_lock_bh(); ++ hlist_nulls_for_each_entry_rcu(meta_tp, node, ++ &mptcp_tk_htable.hashtable[i], ++ tk_table) { ++ struct sock *meta_sk = (struct sock *)meta_tp, *sk; ++ bool meta_v4 = meta_sk->sk_family == AF_INET; ++ struct mptcp_cb *mpcb; ++ ++ if (sock_net(meta_sk) != net) ++ continue; ++ ++ if (meta_v4) { ++ /* skip IPv6 events if meta is IPv4 */ ++ if (event->family == AF_INET6) ++ continue; ++ } else if (event->family == AF_INET && meta_sk->sk_ipv6only) { ++ /* skip IPv4 events if IPV6_V6ONLY is set */ ++ continue; ++ } ++ ++ if (unlikely(!refcount_inc_not_zero(&meta_sk->sk_refcnt))) ++ continue; ++ ++ bh_lock_sock(meta_sk); ++ ++ mpcb = meta_tp->mpcb; ++ if (!mpcb) ++ goto next; ++ ++ if (!mptcp(meta_tp) || !is_meta_sk(meta_sk) || ++ mptcp_in_infinite_mapping_weak(mpcb)) ++ goto next; ++ ++ /* May be that the pm has changed in-between */ ++ if (mpcb->pm_ops != &full_mesh) ++ goto next; ++ ++ if (sock_owned_by_user(meta_sk)) { ++ if (!test_and_set_bit(MPTCP_PATH_MANAGER_DEFERRED, ++ &meta_sk->sk_tsq_flags)) ++ sock_hold(meta_sk); ++ ++ goto next; ++ } ++ ++ if (event->code == MPTCP_EVENT_ADD) { ++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb); ++ ++ fmp->add_addr++; ++ mpcb->addr_signal = 1; ++ ++ sk = mptcp_select_ack_sock(meta_sk); ++ if (sk) ++ tcp_send_ack(sk); ++ ++ full_mesh_create_subflows(meta_sk); ++ } ++ ++ if (event->code == MPTCP_EVENT_DEL) { ++ struct mptcp_tcp_sock *mptcp; ++ struct mptcp_loc_addr *mptcp_local; ++ struct hlist_node *tmp; ++ bool found = false; ++ ++ mptcp_local = rcu_dereference_bh(fm_ns->local); ++ ++ /* In any case, we need to update our bitfields */ ++ if (id >= 0) ++ 
update_addr_bitfields(meta_sk, mptcp_local); ++ ++ /* Look for the socket and remove him */ ++ mptcp_for_each_sub_safe(mpcb, mptcp, tmp) { ++ struct sock *sk = mptcp_to_sock(mptcp); ++ ++ if ((event->family == AF_INET6 && ++ (sk->sk_family == AF_INET || ++ mptcp_v6_is_v4_mapped(sk))) || ++ (event->family == AF_INET && ++ (sk->sk_family == AF_INET6 && ++ !mptcp_v6_is_v4_mapped(sk)))) ++ continue; ++ ++ if (event->family == AF_INET && ++ (sk->sk_family == AF_INET || ++ mptcp_v6_is_v4_mapped(sk)) && ++ inet_sk(sk)->inet_saddr != event->addr.in.s_addr) ++ continue; ++ ++ if (event->family == AF_INET6 && ++ sk->sk_family == AF_INET6 && ++ !ipv6_addr_equal(&inet6_sk(sk)->saddr, &event->addr.in6)) ++ continue; ++ ++ /* Reinject, so that pf = 1 and so we ++ * won't select this one as the ++ * ack-sock. ++ */ ++ mptcp_reinject_data(sk, 0); ++ ++ /* We announce the removal of this id */ ++ announce_remove_addr(tcp_sk(sk)->mptcp->loc_id, meta_sk); ++ ++ mptcp_sub_force_close(sk); ++ found = true; ++ } ++ ++ if (found) ++ goto next; ++ ++ /* The id may have been given by the event, ++ * matching on a local address. And it may not ++ * have matched on one of the above sockets, ++ * because the client never created a subflow. ++ * So, we have to finally remove it here. ++ */ ++ if (id >= 0) { ++ u8 loc_id = id ++ + (event->family == AF_INET ? 1 : MPTCP_MAX_ADDR); ++ announce_remove_addr(loc_id, meta_sk); ++ } ++ } ++ ++ if (event->code == MPTCP_EVENT_MOD) { ++ struct mptcp_tcp_sock *mptcp; ++ ++ mptcp_for_each_sub(mpcb, mptcp) { ++ struct sock *sk = mptcp_to_sock(mptcp); ++ struct tcp_sock *tp = tcp_sk(sk); ++ if (event->family == AF_INET && ++ (sk->sk_family == AF_INET || ++ mptcp_v6_is_v4_mapped(sk)) && ++ inet_sk(sk)->inet_saddr == event->addr.in.s_addr) { ++ if (event->low_prio != tp->mptcp->low_prio) { ++ tp->mptcp->send_mp_prio = 1; ++ tp->mptcp->low_prio = event->low_prio; ++ ++ tcp_send_ack(sk); ++ } ++ } ++ ++ if (event->family == AF_INET6 && ++ sk->sk_family == AF_INET6 && ++ !ipv6_addr_equal(&inet6_sk(sk)->saddr, &event->addr.in6)) { ++ if (event->low_prio != tp->mptcp->low_prio) { ++ tp->mptcp->send_mp_prio = 1; ++ tp->mptcp->low_prio = event->low_prio; ++ ++ tcp_send_ack(sk); ++ } ++ } ++ } ++ } ++next: ++ bh_unlock_sock(meta_sk); ++ sock_put(meta_sk); ++ } ++ rcu_read_unlock_bh(); ++ } ++ goto next_event; ++} ++ ++static struct mptcp_addr_event *lookup_similar_event(const struct net *net, ++ const struct mptcp_addr_event *event) ++{ ++ struct mptcp_addr_event *eventq; ++ struct mptcp_fm_ns *fm_ns = fm_get_ns(net); ++ ++ list_for_each_entry(eventq, &fm_ns->events, list) { ++ if (eventq->family != event->family) ++ continue; ++ if (eventq->if_idx != event->if_idx) ++ continue; ++ if (event->family == AF_INET) { ++ if (eventq->addr.in.s_addr == event->addr.in.s_addr) ++ return eventq; ++ } else { ++ if (ipv6_addr_equal(&eventq->addr.in6, &event->addr.in6)) ++ return eventq; ++ } ++ } ++ return NULL; ++} ++ ++/* We already hold the net-namespace MPTCP-lock */ ++static void add_pm_event(struct net *net, const struct mptcp_addr_event *event) ++{ ++ struct mptcp_addr_event *eventq = lookup_similar_event(net, event); ++ struct mptcp_fm_ns *fm_ns = fm_get_ns(net); ++ ++ if (eventq) { ++ switch (event->code) { ++ case MPTCP_EVENT_DEL: ++ mptcp_debug("%s del old_code %u\n", __func__, eventq->code); ++ list_del(&eventq->list); ++ kfree(eventq); ++ break; ++ case MPTCP_EVENT_ADD: ++ mptcp_debug("%s add old_code %u\n", __func__, eventq->code); ++ eventq->low_prio = event->low_prio; ++ eventq->code = 
MPTCP_EVENT_ADD; ++ return; ++ case MPTCP_EVENT_MOD: ++ mptcp_debug("%s mod old_code %u\n", __func__, eventq->code); ++ eventq->low_prio = event->low_prio; ++ eventq->code = MPTCP_EVENT_MOD; ++ return; ++ } ++ } ++ ++ /* OK, we have to add the new address to the wait queue */ ++ eventq = kmemdup(event, sizeof(struct mptcp_addr_event), GFP_ATOMIC); ++ if (!eventq) ++ return; ++ ++ list_add_tail(&eventq->list, &fm_ns->events); ++ ++ /* Create work-queue */ ++ if (!delayed_work_pending(&fm_ns->address_worker)) ++ queue_delayed_work(mptcp_wq, &fm_ns->address_worker, ++ msecs_to_jiffies(500)); ++} ++ ++static void addr4_event_handler(const struct in_ifaddr *ifa, unsigned long event, ++ struct net *net) ++{ ++ const struct net_device *netdev = ifa->ifa_dev->dev; ++ struct mptcp_fm_ns *fm_ns = fm_get_ns(net); ++ struct mptcp_addr_event mpevent; ++ ++ if (ifa->ifa_scope > RT_SCOPE_LINK || ++ ipv4_is_loopback(ifa->ifa_local)) ++ return; ++ ++ spin_lock_bh(&fm_ns->local_lock); ++ ++ mpevent.family = AF_INET; ++ mpevent.addr.in.s_addr = ifa->ifa_local; ++ mpevent.low_prio = (netdev->flags & IFF_MPBACKUP) ? 1 : 0; ++ mpevent.if_idx = netdev->ifindex; ++ ++ if (event == NETDEV_DOWN || !netif_running(netdev) || ++ (netdev->flags & IFF_NOMULTIPATH) || !(netdev->flags & IFF_UP)) ++ mpevent.code = MPTCP_EVENT_DEL; ++ else if (event == NETDEV_UP) ++ mpevent.code = MPTCP_EVENT_ADD; ++ else if (event == NETDEV_CHANGE) ++ mpevent.code = MPTCP_EVENT_MOD; ++ ++ mptcp_debug("%s created event for %pI4, code %u prio %u idx %u\n", __func__, ++ &ifa->ifa_local, mpevent.code, mpevent.low_prio, mpevent.if_idx); ++ add_pm_event(net, &mpevent); ++ ++ spin_unlock_bh(&fm_ns->local_lock); ++ return; ++} ++ ++/* React on IPv4-addr add/rem-events */ ++static int mptcp_pm_inetaddr_event(struct notifier_block *this, ++ unsigned long event, void *ptr) ++{ ++ const struct in_ifaddr *ifa = (struct in_ifaddr *)ptr; ++ struct net *net = dev_net(ifa->ifa_dev->dev); ++ ++ if (!(event == NETDEV_UP || event == NETDEV_DOWN || ++ event == NETDEV_CHANGE)) ++ return NOTIFY_DONE; ++ ++ addr4_event_handler(ifa, event, net); ++ ++ return NOTIFY_DONE; ++} ++ ++static struct notifier_block mptcp_pm_inetaddr_notifier = { ++ .notifier_call = mptcp_pm_inetaddr_event, ++}; ++ ++#if IS_ENABLED(CONFIG_IPV6) ++ ++static int inet6_addr_event(struct notifier_block *this, unsigned long event, ++ void *ptr); ++ ++static void addr6_event_handler(const struct inet6_ifaddr *ifa, unsigned long event, ++ struct net *net) ++{ ++ const struct net_device *netdev = ifa->idev->dev; ++ int addr_type = ipv6_addr_type(&ifa->addr); ++ struct mptcp_fm_ns *fm_ns = fm_get_ns(net); ++ struct mptcp_addr_event mpevent; ++ ++ if (ifa->scope > RT_SCOPE_LINK || ++ addr_type == IPV6_ADDR_ANY || ++ (addr_type & IPV6_ADDR_LOOPBACK) || ++ (addr_type & IPV6_ADDR_LINKLOCAL)) ++ return; ++ ++ spin_lock_bh(&fm_ns->local_lock); ++ ++ mpevent.family = AF_INET6; ++ mpevent.addr.in6 = ifa->addr; ++ mpevent.low_prio = (netdev->flags & IFF_MPBACKUP) ? 
1 : 0; ++ mpevent.if_idx = netdev->ifindex; ++ ++ if (event == NETDEV_DOWN || !netif_running(netdev) || ++ (netdev->flags & IFF_NOMULTIPATH) || !(netdev->flags & IFF_UP)) ++ mpevent.code = MPTCP_EVENT_DEL; ++ else if (event == NETDEV_UP) ++ mpevent.code = MPTCP_EVENT_ADD; ++ else if (event == NETDEV_CHANGE) ++ mpevent.code = MPTCP_EVENT_MOD; ++ ++ mptcp_debug("%s created event for %pI6, code %u prio %u idx %u\n", __func__, ++ &ifa->addr, mpevent.code, mpevent.low_prio, mpevent.if_idx); ++ add_pm_event(net, &mpevent); ++ ++ spin_unlock_bh(&fm_ns->local_lock); ++ return; ++} ++ ++/* React on IPv6-addr add/rem-events */ ++static int inet6_addr_event(struct notifier_block *this, unsigned long event, ++ void *ptr) ++{ ++ struct inet6_ifaddr *ifa6 = (struct inet6_ifaddr *)ptr; ++ struct net *net = dev_net(ifa6->idev->dev); ++ ++ if (!(event == NETDEV_UP || event == NETDEV_DOWN || ++ event == NETDEV_CHANGE)) ++ return NOTIFY_DONE; ++ ++ addr6_event_handler(ifa6, event, net); ++ ++ return NOTIFY_DONE; ++} ++ ++static struct notifier_block inet6_addr_notifier = { ++ .notifier_call = inet6_addr_event, ++}; ++ ++#endif ++ ++/* React on ifup/down-events */ ++static int netdev_event(struct notifier_block *this, unsigned long event, ++ void *ptr) ++{ ++ const struct net_device *dev = netdev_notifier_info_to_dev(ptr); ++ struct in_device *in_dev; ++#if IS_ENABLED(CONFIG_IPV6) ++ struct inet6_dev *in6_dev; ++#endif ++ ++ if (!(event == NETDEV_UP || event == NETDEV_DOWN || ++ event == NETDEV_CHANGE)) ++ return NOTIFY_DONE; ++ ++ rcu_read_lock(); ++ in_dev = __in_dev_get_rtnl(dev); ++ ++ if (in_dev) { ++ struct in_ifaddr *ifa; ++ ++ in_dev_for_each_ifa_rcu(ifa, in_dev) { ++ mptcp_pm_inetaddr_event(NULL, event, ifa); ++ } ++ } ++ ++#if IS_ENABLED(CONFIG_IPV6) ++ in6_dev = __in6_dev_get(dev); ++ ++ if (in6_dev) { ++ struct inet6_ifaddr *ifa6; ++ list_for_each_entry(ifa6, &in6_dev->addr_list, if_list) ++ inet6_addr_event(NULL, event, ifa6); ++ } ++#endif ++ ++ rcu_read_unlock(); ++ return NOTIFY_DONE; ++} ++ ++static struct notifier_block mptcp_pm_netdev_notifier = { ++ .notifier_call = netdev_event, ++}; ++ ++static void full_mesh_add_raddr(struct mptcp_cb *mpcb, ++ const union inet_addr *addr, ++ sa_family_t family, __be16 port, u8 id) ++{ ++ if (family == AF_INET) ++ mptcp_addv4_raddr(mpcb, &addr->in, port, id); ++ else ++ mptcp_addv6_raddr(mpcb, &addr->in6, port, id); ++} ++ ++static void full_mesh_new_session(const struct sock *meta_sk) ++{ ++ struct mptcp_loc_addr *mptcp_local; ++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; ++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb); ++ const struct mptcp_fm_ns *fm_ns = fm_get_ns(sock_net(meta_sk)); ++ struct tcp_sock *master_tp = tcp_sk(mpcb->master_sk); ++ int i, index, if_idx = 0; ++ union inet_addr saddr, daddr; ++ sa_family_t family = AF_INET; ++ bool meta_v4 = meta_sk->sk_family == AF_INET; ++ ++ /* Init local variables necessary for the rest */ ++ if (meta_sk->sk_family == AF_INET || mptcp_v6_is_v4_mapped(meta_sk)) { ++ saddr.ip = inet_sk(meta_sk)->inet_saddr; ++ daddr.ip = inet_sk(meta_sk)->inet_daddr; ++ if_idx = mpcb->master_sk->sk_bound_dev_if; ++ family = AF_INET; ++#if IS_ENABLED(CONFIG_IPV6) ++ } else { ++ saddr.in6 = inet6_sk(meta_sk)->saddr; ++ daddr.in6 = meta_sk->sk_v6_daddr; ++ if_idx = mpcb->master_sk->sk_bound_dev_if; ++ family = AF_INET6; ++#endif ++ } ++ ++ if (inet_sk(meta_sk)->transparent) ++ if_idx = inet_sk(meta_sk)->rx_dst_ifindex; ++ ++ rcu_read_lock_bh(); ++ mptcp_local = rcu_dereference(fm_ns->local); ++ ++ if 
(inet_sk(meta_sk)->transparent) ++ index = mptcp_find_address_transp(mptcp_local, family, if_idx); ++ else ++ index = mptcp_find_address(mptcp_local, family, &saddr, if_idx); ++ if (index < 0) ++ goto fallback; ++ ++ if (family == AF_INET) ++ master_tp->mptcp->low_prio = mptcp_local->locaddr4[index].low_prio; ++ else ++ master_tp->mptcp->low_prio = mptcp_local->locaddr6[index].low_prio; ++ master_tp->mptcp->send_mp_prio = master_tp->mptcp->low_prio; ++ ++ full_mesh_add_raddr(mpcb, &daddr, family, 0, 0); ++ mptcp_set_init_addr_bit(mpcb, &daddr, family, index); ++ ++ /* Initialize workqueue-struct */ ++ INIT_WORK(&fmp->subflow_work, create_subflow_worker); ++ INIT_DELAYED_WORK(&fmp->subflow_retry_work, retry_subflow_worker); ++ fmp->mpcb = mpcb; ++ ++ if (!meta_v4 && meta_sk->sk_ipv6only) ++ goto skip_ipv4; ++ ++ /* Look for the address among the local addresses */ ++ mptcp_for_each_bit_set(mptcp_local->loc4_bits, i) { ++ __be32 ifa_address = mptcp_local->locaddr4[i].addr.s_addr; ++ ++ /* We do not need to announce the initial subflow's address again */ ++ if (family == AF_INET && ++ (!if_idx || mptcp_local->locaddr4[i].if_idx == if_idx) && ++ saddr.ip == ifa_address) ++ continue; ++ ++ fmp->add_addr++; ++ mpcb->addr_signal = 1; ++ } ++ ++skip_ipv4: ++#if IS_ENABLED(CONFIG_IPV6) ++ /* skip IPv6 addresses if meta-socket is IPv4 */ ++ if (meta_v4) ++ goto skip_ipv6; ++ ++ mptcp_for_each_bit_set(mptcp_local->loc6_bits, i) { ++ const struct in6_addr *ifa6 = &mptcp_local->locaddr6[i].addr; ++ ++ /* We do not need to announce the initial subflow's address again */ ++ if (family == AF_INET6 && ++ (!if_idx || mptcp_local->locaddr6[i].if_idx == if_idx) && ++ ipv6_addr_equal(&saddr.in6, ifa6)) ++ continue; ++ ++ fmp->add_addr++; ++ mpcb->addr_signal = 1; ++ } ++ ++skip_ipv6: ++#endif ++ ++ rcu_read_unlock_bh(); ++ ++ if (family == AF_INET) ++ fmp->announced_addrs_v4 |= (1 << index); ++ else ++ fmp->announced_addrs_v6 |= (1 << index); ++ ++ for (i = fmp->add_addr; i && fmp->add_addr; i--) ++ tcp_send_ack(mpcb->master_sk); ++ ++ if (master_tp->mptcp->send_mp_prio) ++ tcp_send_ack(mpcb->master_sk); ++ ++ return; ++ ++fallback: ++ rcu_read_unlock_bh(); ++ mptcp_fallback_default(mpcb); ++ return; ++} ++ ++static void full_mesh_create_subflows(struct sock *meta_sk) ++{ ++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; ++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb); ++ ++ if (mptcp_in_infinite_mapping_weak(mpcb) || ++ mpcb->server_side || sock_flag(meta_sk, SOCK_DEAD)) ++ return; ++ ++ if (mpcb->master_sk && ++ !tcp_sk(mpcb->master_sk)->mptcp->fully_established) ++ return; ++ ++ if (!work_pending(&fmp->subflow_work)) { ++ sock_hold(meta_sk); ++ refcount_inc(&mpcb->mpcb_refcnt); ++ queue_work(mptcp_wq, &fmp->subflow_work); ++ } ++} ++ ++/* Called upon release_sock, if the socket was owned by the user during ++ * a path-management event. 
++ */ ++static void full_mesh_release_sock(struct sock *meta_sk) ++{ ++ struct mptcp_loc_addr *mptcp_local; ++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; ++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb); ++ const struct mptcp_fm_ns *fm_ns = fm_get_ns(sock_net(meta_sk)); ++ bool meta_v4 = meta_sk->sk_family == AF_INET; ++ struct mptcp_tcp_sock *mptcp; ++ struct hlist_node *tmp; ++ int i; ++ ++ rcu_read_lock_bh(); ++ mptcp_local = rcu_dereference(fm_ns->local); ++ ++ if (!meta_v4 && meta_sk->sk_ipv6only) ++ goto skip_ipv4; ++ ++ /* First, detect modifications or additions */ ++ mptcp_for_each_bit_set(mptcp_local->loc4_bits, i) { ++ struct in_addr ifa = mptcp_local->locaddr4[i].addr; ++ bool found = false; ++ ++ mptcp_for_each_sub(mpcb, mptcp) { ++ struct sock *sk = mptcp_to_sock(mptcp); ++ struct tcp_sock *tp = tcp_sk(sk); ++ ++ if (sk->sk_family == AF_INET6 && ++ !mptcp_v6_is_v4_mapped(sk)) ++ continue; ++ ++ if (inet_sk(sk)->inet_saddr != ifa.s_addr) ++ continue; ++ ++ found = true; ++ ++ if (mptcp_local->locaddr4[i].low_prio != tp->mptcp->low_prio) { ++ tp->mptcp->send_mp_prio = 1; ++ tp->mptcp->low_prio = mptcp_local->locaddr4[i].low_prio; ++ ++ tcp_send_ack(sk); ++ } ++ } ++ ++ if (!found) { ++ struct sock *sk; ++ ++ fmp->add_addr++; ++ mpcb->addr_signal = 1; ++ ++ sk = mptcp_select_ack_sock(meta_sk); ++ if (sk) ++ tcp_send_ack(sk); ++ full_mesh_create_subflows(meta_sk); ++ } ++ } ++ ++skip_ipv4: ++#if IS_ENABLED(CONFIG_IPV6) ++ /* skip IPv6 addresses if meta-socket is IPv4 */ ++ if (meta_v4) ++ goto removal; ++ ++ mptcp_for_each_bit_set(mptcp_local->loc6_bits, i) { ++ struct in6_addr ifa = mptcp_local->locaddr6[i].addr; ++ bool found = false; ++ ++ mptcp_for_each_sub(mpcb, mptcp) { ++ struct sock *sk = mptcp_to_sock(mptcp); ++ struct tcp_sock *tp = tcp_sk(sk); ++ ++ if (sk->sk_family == AF_INET || ++ mptcp_v6_is_v4_mapped(sk)) ++ continue; ++ ++ if (!ipv6_addr_equal(&inet6_sk(sk)->saddr, &ifa)) ++ continue; ++ ++ found = true; ++ ++ if (mptcp_local->locaddr6[i].low_prio != tp->mptcp->low_prio) { ++ tp->mptcp->send_mp_prio = 1; ++ tp->mptcp->low_prio = mptcp_local->locaddr6[i].low_prio; ++ ++ tcp_send_ack(sk); ++ } ++ } ++ ++ if (!found) { ++ struct sock *sk; ++ ++ fmp->add_addr++; ++ mpcb->addr_signal = 1; ++ ++ sk = mptcp_select_ack_sock(meta_sk); ++ if (sk) ++ tcp_send_ack(sk); ++ full_mesh_create_subflows(meta_sk); ++ } ++ } ++ ++removal: ++#endif ++ ++ /* Now, detect address-removals */ ++ mptcp_for_each_sub_safe(mpcb, mptcp, tmp) { ++ struct sock *sk = mptcp_to_sock(mptcp); ++ bool shall_remove = true; ++ ++ if (sk->sk_family == AF_INET || mptcp_v6_is_v4_mapped(sk)) { ++ mptcp_for_each_bit_set(mptcp_local->loc4_bits, i) { ++ if (inet_sk(sk)->inet_saddr == mptcp_local->locaddr4[i].addr.s_addr) { ++ shall_remove = false; ++ break; ++ } ++ } ++ } else { ++ mptcp_for_each_bit_set(mptcp_local->loc6_bits, i) { ++ if (ipv6_addr_equal(&inet6_sk(sk)->saddr, &mptcp_local->locaddr6[i].addr)) { ++ shall_remove = false; ++ break; ++ } ++ } ++ } ++ ++ if (shall_remove) { ++ /* Reinject, so that pf = 1 and so we ++ * won't select this one as the ++ * ack-sock. ++ */ ++ mptcp_reinject_data(sk, 0); ++ ++ announce_remove_addr(tcp_sk(sk)->mptcp->loc_id, ++ meta_sk); ++ ++ mptcp_sub_force_close(sk); ++ } ++ } ++ ++ /* Just call it optimistically. 
It actually cannot do any harm */ ++ update_addr_bitfields(meta_sk, mptcp_local); ++ ++ rcu_read_unlock_bh(); ++} ++ ++static int full_mesh_get_local_id(const struct sock *meta_sk, ++ sa_family_t family, union inet_addr *addr, ++ bool *low_prio) ++{ ++ struct mptcp_loc_addr *mptcp_local; ++ const struct mptcp_fm_ns *fm_ns = fm_get_ns(sock_net(meta_sk)); ++ int index, id = -1; ++ ++ /* Handle the backup-flows */ ++ rcu_read_lock_bh(); ++ mptcp_local = rcu_dereference(fm_ns->local); ++ ++ index = mptcp_find_address(mptcp_local, family, addr, 0); ++ ++ if (index != -1) { ++ if (family == AF_INET) { ++ id = mptcp_local->locaddr4[index].loc4_id; ++ *low_prio = mptcp_local->locaddr4[index].low_prio; ++ } else { ++ id = mptcp_local->locaddr6[index].loc6_id; ++ *low_prio = mptcp_local->locaddr6[index].low_prio; ++ } ++ } ++ ++ ++ rcu_read_unlock_bh(); ++ ++ return id; ++} ++ ++static void full_mesh_addr_signal(struct sock *sk, unsigned *size, ++ struct tcp_out_options *opts, ++ struct sk_buff *skb) ++{ ++ const struct tcp_sock *tp = tcp_sk(sk); ++ struct mptcp_cb *mpcb = tp->mpcb; ++ struct sock *meta_sk = mpcb->meta_sk; ++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb); ++ struct mptcp_loc_addr *mptcp_local; ++ struct mptcp_fm_ns *fm_ns = fm_get_ns(sock_net(sk)); ++ int remove_addr_len; ++ u8 unannouncedv4 = 0, unannouncedv6 = 0; ++ bool meta_v4 = meta_sk->sk_family == AF_INET; ++ ++ mpcb->addr_signal = 0; ++ ++ if (likely(!fmp->add_addr)) ++ goto remove_addr; ++ ++ rcu_read_lock_bh(); ++ mptcp_local = rcu_dereference(fm_ns->local); ++ ++ if (!meta_v4 && meta_sk->sk_ipv6only) ++ goto skip_ipv4; ++ ++ /* IPv4 */ ++ unannouncedv4 = (~fmp->announced_addrs_v4) & mptcp_local->loc4_bits; ++ if (unannouncedv4 && ++ ((mpcb->mptcp_ver == MPTCP_VERSION_0 && ++ MAX_TCP_OPTION_SPACE - *size >= MPTCP_SUB_LEN_ADD_ADDR4_ALIGN) || ++ (mpcb->mptcp_ver >= MPTCP_VERSION_1 && ++ MAX_TCP_OPTION_SPACE - *size >= MPTCP_SUB_LEN_ADD_ADDR4_ALIGN_VER1))) { ++ int ind = mptcp_find_free_index(~unannouncedv4); ++ ++ opts->options |= OPTION_MPTCP; ++ opts->mptcp_options |= OPTION_ADD_ADDR; ++ opts->add_addr4.addr_id = mptcp_local->locaddr4[ind].loc4_id; ++ opts->add_addr4.addr = mptcp_local->locaddr4[ind].addr; ++ opts->add_addr_v4 = 1; ++ if (mpcb->mptcp_ver >= MPTCP_VERSION_1) { ++ u8 mptcp_hash_mac[SHA256_DIGEST_SIZE]; ++ ++ mptcp_hmac(mpcb->mptcp_ver, (u8 *)&mpcb->mptcp_loc_key, ++ (u8 *)&mpcb->mptcp_rem_key, mptcp_hash_mac, 2, ++ 1, (u8 *)&mptcp_local->locaddr4[ind].loc4_id, ++ 4, (u8 *)&opts->add_addr4.addr.s_addr); ++ opts->add_addr4.trunc_mac = *(u64 *)&mptcp_hash_mac[SHA256_DIGEST_SIZE - sizeof(u64)]; ++ } ++ ++ if (skb) { ++ fmp->announced_addrs_v4 |= (1 << ind); ++ fmp->add_addr--; ++ } ++ ++ if (mpcb->mptcp_ver < MPTCP_VERSION_1) ++ *size += MPTCP_SUB_LEN_ADD_ADDR4_ALIGN; ++ if (mpcb->mptcp_ver >= MPTCP_VERSION_1) ++ *size += MPTCP_SUB_LEN_ADD_ADDR4_ALIGN_VER1; ++ ++ goto skip_ipv6; ++ } ++ ++ if (meta_v4) ++ goto skip_ipv6; ++skip_ipv4: ++ /* IPv6 */ ++ unannouncedv6 = (~fmp->announced_addrs_v6) & mptcp_local->loc6_bits; ++ if (unannouncedv6 && ++ ((mpcb->mptcp_ver == MPTCP_VERSION_0 && ++ MAX_TCP_OPTION_SPACE - *size >= MPTCP_SUB_LEN_ADD_ADDR6_ALIGN) || ++ (mpcb->mptcp_ver >= MPTCP_VERSION_1 && ++ MAX_TCP_OPTION_SPACE - *size >= MPTCP_SUB_LEN_ADD_ADDR6_ALIGN_VER1))) { ++ int ind = mptcp_find_free_index(~unannouncedv6); ++ ++ opts->options |= OPTION_MPTCP; ++ opts->mptcp_options |= OPTION_ADD_ADDR; ++ opts->add_addr6.addr_id = mptcp_local->locaddr6[ind].loc6_id; ++ opts->add_addr6.addr = 
mptcp_local->locaddr6[ind].addr; ++ opts->add_addr_v6 = 1; ++ if (mpcb->mptcp_ver >= MPTCP_VERSION_1) { ++ u8 mptcp_hash_mac[SHA256_DIGEST_SIZE]; ++ ++ mptcp_hmac(mpcb->mptcp_ver, (u8 *)&mpcb->mptcp_loc_key, ++ (u8 *)&mpcb->mptcp_rem_key, mptcp_hash_mac, 2, ++ 1, (u8 *)&mptcp_local->locaddr6[ind].loc6_id, ++ 16, (u8 *)&opts->add_addr6.addr.s6_addr); ++ opts->add_addr6.trunc_mac = *(u64 *)&mptcp_hash_mac[SHA256_DIGEST_SIZE - sizeof(u64)]; ++ } ++ ++ if (skb) { ++ fmp->announced_addrs_v6 |= (1 << ind); ++ fmp->add_addr--; ++ } ++ if (mpcb->mptcp_ver < MPTCP_VERSION_1) ++ *size += MPTCP_SUB_LEN_ADD_ADDR6_ALIGN; ++ if (mpcb->mptcp_ver >= MPTCP_VERSION_1) ++ *size += MPTCP_SUB_LEN_ADD_ADDR6_ALIGN_VER1; ++ } ++ ++skip_ipv6: ++ rcu_read_unlock_bh(); ++ ++ if (!unannouncedv4 && !unannouncedv6 && skb) ++ fmp->add_addr--; ++ ++remove_addr: ++ if (likely(!fmp->remove_addrs)) ++ goto exit; ++ ++ remove_addr_len = mptcp_sub_len_remove_addr_align(fmp->remove_addrs); ++ if (MAX_TCP_OPTION_SPACE - *size < remove_addr_len) ++ goto exit; ++ ++ opts->options |= OPTION_MPTCP; ++ opts->mptcp_options |= OPTION_REMOVE_ADDR; ++ opts->remove_addrs = fmp->remove_addrs; ++ *size += remove_addr_len; ++ if (skb) ++ fmp->remove_addrs = 0; ++ ++exit: ++ mpcb->addr_signal = !!(fmp->add_addr || fmp->remove_addrs); ++} ++ ++static void full_mesh_rem_raddr(struct mptcp_cb *mpcb, u8 rem_id) ++{ ++ mptcp_v4_rem_raddress(mpcb, rem_id); ++ mptcp_v6_rem_raddress(mpcb, rem_id); ++} ++ ++static void full_mesh_delete_subflow(struct sock *sk) ++{ ++ struct fullmesh_priv *fmp = fullmesh_get_priv(tcp_sk(sk)->mpcb); ++ struct mptcp_fm_ns *fm_ns = fm_get_ns(sock_net(sk)); ++ struct sock *meta_sk = mptcp_meta_sk(sk); ++ struct mptcp_loc_addr *mptcp_local; ++ int index, i; ++ ++ if (!create_on_err) ++ return; ++ ++ if (!mptcp_can_new_subflow(meta_sk)) ++ return; ++ ++ rcu_read_lock_bh(); ++ mptcp_local = rcu_dereference_bh(fm_ns->local); ++ ++ if (sk->sk_family == AF_INET || mptcp_v6_is_v4_mapped(sk)) { ++ union inet_addr saddr; ++ ++ saddr.ip = inet_sk(sk)->inet_saddr; ++ index = mptcp_find_address(mptcp_local, AF_INET, &saddr, ++ sk->sk_bound_dev_if); ++ if (index < 0) ++ goto out; ++ ++ mptcp_for_each_bit_set(fmp->rem4_bits, i) { ++ struct fullmesh_rem4 *rem4 = &fmp->remaddr4[i]; ++ ++ if (rem4->addr.s_addr != sk->sk_daddr) ++ continue; ++ ++ if (rem4->port && rem4->port != inet_sk(sk)->inet_dport) ++ continue; ++ ++ rem4->bitfield &= ~(1 << index); ++ } ++#if IS_ENABLED(CONFIG_IPV6) ++ } else { ++ union inet_addr saddr; ++ ++ saddr.in6 = inet6_sk(sk)->saddr; ++ index = mptcp_find_address(mptcp_local, AF_INET6, &saddr, ++ sk->sk_bound_dev_if); ++ if (index < 0) ++ goto out; ++ ++ mptcp_for_each_bit_set(fmp->rem6_bits, i) { ++ struct fullmesh_rem6 *rem6 = &fmp->remaddr6[i]; ++ ++ if (!ipv6_addr_equal(&rem6->addr, &sk->sk_v6_daddr)) ++ continue; ++ ++ if (rem6->port && rem6->port != inet_sk(sk)->inet_dport) ++ continue; ++ ++ rem6->bitfield &= ~(1 << index); ++ } ++#endif ++ } ++ ++out: ++ rcu_read_unlock_bh(); ++ ++ /* re-schedule the creation of failed subflows */ ++ if (tcp_sk(sk)->mptcp->sk_err == ETIMEDOUT || sk->sk_err == ETIMEDOUT) ++ full_mesh_create_subflows(meta_sk); ++} ++ ++/* Output /proc/net/mptcp_fullmesh */ ++static int mptcp_fm_seq_show(struct seq_file *seq, void *v) ++{ ++ const struct net *net = seq->private; ++ struct mptcp_loc_addr *mptcp_local; ++ const struct mptcp_fm_ns *fm_ns = fm_get_ns(net); ++ int i; ++ ++ seq_printf(seq, "Index, Address-ID, Backup, IP-address, if-idx\n"); ++ ++ rcu_read_lock_bh(); ++ 
mptcp_local = rcu_dereference(fm_ns->local); ++ ++ seq_printf(seq, "IPv4, next v4-index: %u\n", mptcp_local->next_v4_index); ++ ++ mptcp_for_each_bit_set(mptcp_local->loc4_bits, i) { ++ struct mptcp_loc4 *loc4 = &mptcp_local->locaddr4[i]; ++ ++ seq_printf(seq, "%u, %u, %u, %pI4, %u\n", i, loc4->loc4_id, ++ loc4->low_prio, &loc4->addr, loc4->if_idx); ++ } ++ ++ seq_printf(seq, "IPv6, next v6-index: %u\n", mptcp_local->next_v6_index); ++ ++ mptcp_for_each_bit_set(mptcp_local->loc6_bits, i) { ++ struct mptcp_loc6 *loc6 = &mptcp_local->locaddr6[i]; ++ ++ seq_printf(seq, "%u, %u, %u, %pI6, %u\n", i, loc6->loc6_id, ++ loc6->low_prio, &loc6->addr, loc6->if_idx); ++ } ++ rcu_read_unlock_bh(); ++ ++ return 0; ++} ++ ++static int mptcp_fm_init_net(struct net *net) ++{ ++ struct mptcp_loc_addr *mptcp_local; ++ struct mptcp_fm_ns *fm_ns; ++ int err = 0; ++ ++ fm_ns = kzalloc(sizeof(*fm_ns), GFP_KERNEL); ++ if (!fm_ns) ++ return -ENOBUFS; ++ ++ mptcp_local = kzalloc(sizeof(*mptcp_local), GFP_KERNEL); ++ if (!mptcp_local) { ++ err = -ENOBUFS; ++ goto err_mptcp_local; ++ } ++ ++ if (!proc_create_net_single("mptcp_fullmesh", S_IRUGO, net->proc_net, ++ mptcp_fm_seq_show, NULL)) { ++ err = -ENOMEM; ++ goto err_seq_fops; ++ } ++ ++ mptcp_local->next_v4_index = 1; ++ ++ rcu_assign_pointer(fm_ns->local, mptcp_local); ++ INIT_DELAYED_WORK(&fm_ns->address_worker, mptcp_address_worker); ++ INIT_LIST_HEAD(&fm_ns->events); ++ spin_lock_init(&fm_ns->local_lock); ++ fm_ns->net = net; ++ net->mptcp.path_managers[MPTCP_PM_FULLMESH] = fm_ns; ++ ++ return 0; ++err_seq_fops: ++ kfree(mptcp_local); ++err_mptcp_local: ++ kfree(fm_ns); ++ return err; ++} ++ ++static void mptcp_fm_exit_net(struct net *net) ++{ ++ struct mptcp_addr_event *eventq, *tmp; ++ struct mptcp_fm_ns *fm_ns; ++ struct mptcp_loc_addr *mptcp_local; ++ ++ fm_ns = fm_get_ns(net); ++ cancel_delayed_work_sync(&fm_ns->address_worker); ++ ++ rcu_read_lock_bh(); ++ ++ mptcp_local = rcu_dereference_bh(fm_ns->local); ++ kfree_rcu(mptcp_local, rcu); ++ ++ spin_lock(&fm_ns->local_lock); ++ list_for_each_entry_safe(eventq, tmp, &fm_ns->events, list) { ++ list_del(&eventq->list); ++ kfree(eventq); ++ } ++ spin_unlock(&fm_ns->local_lock); ++ ++ rcu_read_unlock_bh(); ++ ++ remove_proc_entry("mptcp_fullmesh", net->proc_net); ++ ++ kfree(fm_ns); ++} ++ ++static struct pernet_operations full_mesh_net_ops = { ++ .init = mptcp_fm_init_net, ++ .exit = mptcp_fm_exit_net, ++}; ++ ++static struct mptcp_pm_ops full_mesh __read_mostly = { ++ .new_session = full_mesh_new_session, ++ .release_sock = full_mesh_release_sock, ++ .fully_established = full_mesh_create_subflows, ++ .new_remote_address = full_mesh_create_subflows, ++ .get_local_id = full_mesh_get_local_id, ++ .addr_signal = full_mesh_addr_signal, ++ .add_raddr = full_mesh_add_raddr, ++ .rem_raddr = full_mesh_rem_raddr, ++ .delete_subflow = full_mesh_delete_subflow, ++ .name = "fullmesh", ++ .owner = THIS_MODULE, ++}; ++ ++/* General initialization of MPTCP_PM */ ++static int __init full_mesh_register(void) ++{ ++ int ret; ++ ++ BUILD_BUG_ON(sizeof(struct fullmesh_priv) > MPTCP_PM_SIZE); ++ ++ ret = register_pernet_subsys(&full_mesh_net_ops); ++ if (ret) ++ goto out; ++ ++ ret = register_inetaddr_notifier(&mptcp_pm_inetaddr_notifier); ++ if (ret) ++ goto err_reg_inetaddr; ++ ret = register_netdevice_notifier(&mptcp_pm_netdev_notifier); ++ if (ret) ++ goto err_reg_netdev; ++ ++#if IS_ENABLED(CONFIG_IPV6) ++ ret = register_inet6addr_notifier(&inet6_addr_notifier); ++ if (ret) ++ goto err_reg_inet6addr; ++#endif ++ ++ ret = 
mptcp_register_path_manager(&full_mesh); ++ if (ret) ++ goto err_reg_pm; ++ ++out: ++ return ret; ++ ++ ++err_reg_pm: ++#if IS_ENABLED(CONFIG_IPV6) ++ unregister_inet6addr_notifier(&inet6_addr_notifier); ++err_reg_inet6addr: ++#endif ++ unregister_netdevice_notifier(&mptcp_pm_netdev_notifier); ++err_reg_netdev: ++ unregister_inetaddr_notifier(&mptcp_pm_inetaddr_notifier); ++err_reg_inetaddr: ++ unregister_pernet_subsys(&full_mesh_net_ops); ++ goto out; ++} ++ ++static void full_mesh_unregister(void) ++{ ++#if IS_ENABLED(CONFIG_IPV6) ++ unregister_inet6addr_notifier(&inet6_addr_notifier); ++#endif ++ unregister_netdevice_notifier(&mptcp_pm_netdev_notifier); ++ unregister_inetaddr_notifier(&mptcp_pm_inetaddr_notifier); ++ unregister_pernet_subsys(&full_mesh_net_ops); ++ mptcp_unregister_path_manager(&full_mesh); ++} ++ ++module_init(full_mesh_register); ++module_exit(full_mesh_unregister); ++ ++MODULE_AUTHOR("Christoph Paasch"); ++MODULE_LICENSE("GPL"); ++MODULE_DESCRIPTION("Full-Mesh MPTCP"); ++MODULE_VERSION("0.88"); +diff --git a/net/mptcp/mptcp_input.c b/net/mptcp/mptcp_input.c +new file mode 100644 +index 000000000000..ae9cc7209613 +--- /dev/null ++++ b/net/mptcp/mptcp_input.c +@@ -0,0 +1,2546 @@ ++/* ++ * MPTCP implementation - Sending side ++ * ++ * Initial Design & Implementation: ++ * SĂ©bastien BarrĂ© ++ * ++ * Current Maintainer & Author: ++ * Christoph Paasch ++ * ++ * Additional authors: ++ * Jaakko Korkeaniemi ++ * Gregory Detal ++ * Fabien DuchĂªne ++ * Andreas Seelinger ++ * Lavkesh Lahngir ++ * Andreas Ripke ++ * Vlad Dogaru ++ * Octavian Purdila ++ * John Ronan ++ * Catalin Nicutar ++ * Brandon Heller ++ * ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License ++ * as published by the Free Software Foundation; either version ++ * 2 of the License, or (at your option) any later version. ++ */ ++ ++#include ++ ++#include ++#include ++#include ++ ++#include ++ ++/* is seq1 < seq2 ? */ ++static inline bool before64(const u64 seq1, const u64 seq2) ++{ ++ return (s64)(seq1 - seq2) < 0; ++} ++ ++/* is seq1 > seq2 ? */ ++#define after64(seq1, seq2) before64(seq2, seq1) ++ ++static inline void mptcp_become_fully_estab(struct sock *sk) ++{ ++ tcp_sk(sk)->mptcp->fully_established = 1; ++ ++ if (is_master_tp(tcp_sk(sk)) && ++ tcp_sk(sk)->mpcb->pm_ops->fully_established) ++ tcp_sk(sk)->mpcb->pm_ops->fully_established(mptcp_meta_sk(sk)); ++} ++ ++/* Similar to tcp_tso_acked without any memory accounting */ ++static inline int mptcp_tso_acked_reinject(const struct sock *meta_sk, ++ struct sk_buff *skb) ++{ ++ const struct tcp_sock *meta_tp = tcp_sk(meta_sk); ++ u32 packets_acked, len, delta_truesize; ++ ++ BUG_ON(!after(TCP_SKB_CB(skb)->end_seq, meta_tp->snd_una)); ++ ++ packets_acked = tcp_skb_pcount(skb); ++ ++ if (skb_unclone(skb, GFP_ATOMIC)) ++ return 0; ++ ++ len = meta_tp->snd_una - TCP_SKB_CB(skb)->seq; ++ delta_truesize = __pskb_trim_head(skb, len); ++ ++ TCP_SKB_CB(skb)->seq += len; ++ skb->ip_summed = CHECKSUM_PARTIAL; ++ ++ if (delta_truesize) ++ skb->truesize -= delta_truesize; ++ ++ /* Any change of skb->len requires recalculation of tso factor. 
*/ ++ if (tcp_skb_pcount(skb) > 1) ++ tcp_set_skb_tso_segs(skb, tcp_skb_mss(skb)); ++ packets_acked -= tcp_skb_pcount(skb); ++ ++ if (packets_acked) { ++ BUG_ON(tcp_skb_pcount(skb) == 0); ++ BUG_ON(!before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)); ++ } ++ ++ return packets_acked; ++} ++ ++/* Cleans the meta-socket retransmission queue and the reinject-queue. */ ++static void mptcp_clean_rtx_queue(struct sock *meta_sk, u32 prior_snd_una) ++{ ++ struct sk_buff *skb, *tmp, *next; ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk); ++ struct mptcp_cb *mpcb = meta_tp->mpcb; ++ bool fully_acked = true; ++ bool acked = false; ++ u32 acked_pcount; ++ ++ for (skb = skb_rb_first(&meta_sk->tcp_rtx_queue); skb; skb = next) { ++ struct tcp_skb_cb *scb = TCP_SKB_CB(skb); ++ ++ tcp_ack_tstamp(meta_sk, skb, prior_snd_una); ++ ++ if (after(scb->end_seq, meta_tp->snd_una)) { ++ if (tcp_skb_pcount(skb) == 1 || ++ !after(meta_tp->snd_una, scb->seq)) ++ break; ++ ++ acked_pcount = tcp_tso_acked(meta_sk, skb); ++ if (!acked_pcount) ++ break; ++ fully_acked = false; ++ } else { ++ acked_pcount = tcp_skb_pcount(skb); ++ } ++ ++ acked = true; ++ meta_tp->packets_out -= acked_pcount; ++ meta_tp->retrans_stamp = 0; ++ ++ if (!fully_acked) ++ break; ++ ++ next = skb_rb_next(skb); ++ ++ if (mptcp_is_data_fin(skb)) { ++ struct mptcp_tcp_sock *mptcp; ++ struct hlist_node *tmp; ++ ++ /* DATA_FIN has been acknowledged - now we can close ++ * the subflows ++ */ ++ mptcp_for_each_sub_safe(mpcb, mptcp, tmp) { ++ struct sock *sk_it = mptcp_to_sock(mptcp); ++ unsigned long delay = 0; ++ ++ /* If we are the passive closer, don't trigger ++ * subflow-fin until the subflow has been finned ++ * by the peer - thus we add a delay. ++ */ ++ if (mpcb->passive_close && ++ sk_it->sk_state == TCP_ESTABLISHED) ++ delay = inet_csk(sk_it)->icsk_rto << 3; ++ ++ mptcp_sub_close(sk_it, delay); ++ } ++ } ++ tcp_rtx_queue_unlink_and_free(skb, meta_sk); ++ } ++ /* Remove acknowledged data from the reinject queue */ ++ skb_queue_walk_safe(&mpcb->reinject_queue, skb, tmp) { ++ if (before(meta_tp->snd_una, TCP_SKB_CB(skb)->end_seq)) { ++ if (tcp_skb_pcount(skb) == 1 || ++ !after(meta_tp->snd_una, TCP_SKB_CB(skb)->seq)) ++ break; ++ ++ mptcp_tso_acked_reinject(meta_sk, skb); ++ break; ++ } ++ ++ __skb_unlink(skb, &mpcb->reinject_queue); ++ __kfree_skb(skb); ++ } ++ ++ if (likely(between(meta_tp->snd_up, prior_snd_una, meta_tp->snd_una))) ++ meta_tp->snd_up = meta_tp->snd_una; ++ ++ if (acked) { ++ tcp_rearm_rto(meta_sk); ++ /* Normally this is done in tcp_try_undo_loss - but MPTCP ++ * does not call this function. ++ */ ++ inet_csk(meta_sk)->icsk_retransmits = 0; ++ } ++} ++ ++/* Inspired by tcp_rcv_state_process */ ++/* Returns 0 if processing the packet can continue ++ * -1 if connection was closed with an active reset ++ * 1 if connection was closed and processing should stop. 
++ */ ++static int mptcp_rcv_state_process(struct sock *meta_sk, struct sock *sk, ++ const struct sk_buff *skb, u32 data_seq, ++ u16 data_len) ++{ ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk), *tp = tcp_sk(sk); ++ const struct tcphdr *th = tcp_hdr(skb); ++ ++ /* State-machine handling if FIN has been enqueued and he has ++ * been acked (snd_una == write_seq) - it's important that this ++ * here is after sk_wmem_free_skb because otherwise ++ * sk_forward_alloc is wrong upon inet_csk_destroy_sock() ++ */ ++ switch (meta_sk->sk_state) { ++ case TCP_FIN_WAIT1: { ++ struct dst_entry *dst; ++ int tmo; ++ ++ if (meta_tp->snd_una != meta_tp->write_seq) ++ break; ++ ++ tcp_set_state(meta_sk, TCP_FIN_WAIT2); ++ meta_sk->sk_shutdown |= SEND_SHUTDOWN; ++ ++ dst = __sk_dst_get(sk); ++ if (dst) ++ dst_confirm(dst); ++ ++ if (!sock_flag(meta_sk, SOCK_DEAD)) { ++ /* Wake up lingering close() */ ++ meta_sk->sk_state_change(meta_sk); ++ break; ++ } ++ ++ if (meta_tp->linger2 < 0 || ++ (data_len && ++ after(data_seq + data_len - (mptcp_is_data_fin2(skb, tp) ? 1 : 0), ++ meta_tp->rcv_nxt))) { ++ mptcp_send_active_reset(meta_sk, GFP_ATOMIC); ++ tcp_done(meta_sk); ++ NET_INC_STATS(sock_net(meta_sk), LINUX_MIB_TCPABORTONDATA); ++ return -1; ++ } ++ ++ tmo = tcp_fin_time(meta_sk); ++ if (tmo > TCP_TIMEWAIT_LEN) { ++ inet_csk_reset_keepalive_timer(meta_sk, tmo - TCP_TIMEWAIT_LEN); ++ } else if (mptcp_is_data_fin2(skb, tp) || sock_owned_by_user(meta_sk)) { ++ /* Bad case. We could lose such FIN otherwise. ++ * It is not a big problem, but it looks confusing ++ * and not so rare event. We still can lose it now, ++ * if it spins in bh_lock_sock(), but it is really ++ * marginal case. ++ */ ++ inet_csk_reset_keepalive_timer(meta_sk, tmo); ++ } else { ++ meta_tp->ops->time_wait(meta_sk, TCP_FIN_WAIT2, tmo); ++ } ++ break; ++ } ++ case TCP_CLOSING: ++ case TCP_LAST_ACK: ++ if (meta_tp->snd_una == meta_tp->write_seq) { ++ tcp_done(meta_sk); ++ return 1; ++ } ++ break; ++ } ++ ++ /* step 7: process the segment text */ ++ switch (meta_sk->sk_state) { ++ case TCP_FIN_WAIT1: ++ case TCP_FIN_WAIT2: ++ /* RFC 793 says to queue data in these states, ++ * RFC 1122 says we MUST send a reset. ++ * BSD 4.4 also does reset. ++ */ ++ if (meta_sk->sk_shutdown & RCV_SHUTDOWN) { ++ if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && ++ after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt) && ++ !mptcp_is_data_fin2(skb, tp)) { ++ NET_INC_STATS(sock_net(meta_sk), LINUX_MIB_TCPABORTONDATA); ++ mptcp_send_active_reset(meta_sk, GFP_ATOMIC); ++ tcp_reset(meta_sk); ++ return -1; ++ } ++ } ++ break; ++ } ++ ++ return 0; ++} ++ ++/** ++ * @return: ++ * i) 1: Everything's fine. ++ * ii) -1: A reset has been sent on the subflow - csum-failure ++ * iii) 0: csum-failure but no reset sent, because it's the last subflow. ++ * Last packet should not be destroyed by the caller because it has ++ * been done here. 
++ */ ++static int mptcp_verif_dss_csum(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct sk_buff *tmp, *tmp1, *last = NULL; ++ __wsum csum_tcp = 0; /* cumulative checksum of pld + mptcp-header */ ++ int ans = 1, overflowed = 0, offset = 0, dss_csum_added = 0; ++ int iter = 0; ++ u32 next_seq, offset_seq; ++ ++ skb_queue_walk_safe(&sk->sk_receive_queue, tmp, tmp1) { ++ unsigned int csum_len; ++ ++ /* init next seq in first round */ ++ if (!iter) ++ next_seq = TCP_SKB_CB(tmp)->seq; ++ offset_seq = next_seq - TCP_SKB_CB(tmp)->seq; ++ ++ if (before(tp->mptcp->map_subseq + tp->mptcp->map_data_len, TCP_SKB_CB(tmp)->end_seq)) ++ /* Mapping ends in the middle of the packet - ++ * csum only these bytes ++ */ ++ csum_len = tp->mptcp->map_subseq + tp->mptcp->map_data_len - TCP_SKB_CB(tmp)->seq; ++ else ++ csum_len = tmp->len; ++ ++ csum_len -= offset_seq; ++ offset = 0; ++ if (overflowed) { ++ char first_word[4]; ++ first_word[0] = 0; ++ first_word[1] = 0; ++ first_word[2] = 0; ++ first_word[3] = *(tmp->data + offset_seq); ++ csum_tcp = csum_partial(first_word, 4, csum_tcp); ++ offset = 1; ++ csum_len--; ++ overflowed = 0; ++ } ++ ++ csum_tcp = skb_checksum(tmp, offset + offset_seq, csum_len, ++ csum_tcp); ++ ++ /* Was it on an odd-length? Then we have to merge the next byte ++ * correctly (see above) ++ */ ++ if (csum_len != (csum_len & (~1))) ++ overflowed = 1; ++ ++ if (mptcp_is_data_seq(tmp) && !dss_csum_added) { ++ __be32 data_seq = htonl((u32)(tp->mptcp->map_data_seq >> 32)); ++ ++ /* If a 64-bit dss is present, we increase the offset ++ * by 4 bytes, as the high-order 64-bits will be added ++ * in the final csum_partial-call. ++ */ ++ u32 offset = skb_transport_offset(tmp) + ++ TCP_SKB_CB(tmp)->dss_off; ++ if (TCP_SKB_CB(tmp)->mptcp_flags & MPTCPHDR_SEQ64_SET) ++ offset += 4; ++ ++ csum_tcp = skb_checksum(tmp, offset, ++ MPTCP_SUB_LEN_SEQ_CSUM, ++ csum_tcp); ++ ++ csum_tcp = csum_partial(&data_seq, ++ sizeof(data_seq), csum_tcp); ++ ++ dss_csum_added = 1; /* Just do it once */ ++ } else if (mptcp_is_data_mpcapable(tmp) && !dss_csum_added) { ++ u32 offset = skb_transport_offset(tmp) + TCP_SKB_CB(tmp)->dss_off; ++ __be64 data_seq = htonll(tp->mptcp->map_data_seq); ++ __be32 rel_seq = htonl(tp->mptcp->map_subseq - tp->mptcp->rcv_isn); ++ ++ csum_tcp = csum_partial(&data_seq, sizeof(data_seq), csum_tcp); ++ csum_tcp = csum_partial(&rel_seq, sizeof(rel_seq), csum_tcp); ++ ++ csum_tcp = skb_checksum(tmp, offset, 4, csum_tcp); ++ ++ dss_csum_added = 1; ++ } ++ last = tmp; ++ iter++; ++ ++ if (!skb_queue_is_last(&sk->sk_receive_queue, tmp) && ++ !before(TCP_SKB_CB(tmp1)->seq, ++ tp->mptcp->map_subseq + tp->mptcp->map_data_len)) ++ break; ++ next_seq = TCP_SKB_CB(tmp)->end_seq; ++ } ++ ++ /* Now, checksum must be 0 */ ++ if (unlikely(csum_fold(csum_tcp))) { ++ struct mptcp_tcp_sock *mptcp; ++ struct sock *sk_it = NULL; ++ ++ pr_debug("%s csum is wrong: %#x tcp-seq %u dss_csum_added %d overflowed %d iterations %d\n", ++ __func__, csum_fold(csum_tcp), TCP_SKB_CB(last)->seq, ++ dss_csum_added, overflowed, iter); ++ ++ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_CSUMFAIL); ++ tp->mptcp->send_mp_fail = 1; ++ ++ /* map_data_seq is the data-seq number of the ++ * mapping we are currently checking ++ */ ++ tp->mpcb->csum_cutoff_seq = tp->mptcp->map_data_seq; ++ ++ /* Search for another subflow that is fully established */ ++ mptcp_for_each_sub(tp->mpcb, mptcp) { ++ sk_it = mptcp_to_sock(mptcp); ++ ++ if (sk_it != sk && ++ tcp_sk(sk_it)->mptcp->fully_established) ++ break; ++ ++ sk_it = NULL; ++ 
} ++ ++ if (sk_it) { ++ mptcp_send_reset(sk); ++ ans = -1; ++ } else { ++ tp->mpcb->send_infinite_mapping = 1; ++ ++ /* Need to purge the rcv-queue as it's no more valid */ ++ while ((tmp = __skb_dequeue(&sk->sk_receive_queue)) != NULL) { ++ tp->copied_seq = TCP_SKB_CB(tmp)->end_seq; ++ kfree_skb(tmp); ++ } ++ ++ mptcp_fallback_close(tp->mpcb, sk); ++ ++ ans = 0; ++ } ++ } ++ ++ return ans; ++} ++ ++static inline void mptcp_prepare_skb(struct sk_buff *skb, ++ const struct sock *sk) ++{ ++ const struct tcp_sock *tp = tcp_sk(sk); ++ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); ++ u32 inc = 0, end_seq = tcb->end_seq; ++ ++ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) ++ end_seq--; ++ /* If skb is the end of this mapping (end is always at mapping-boundary ++ * thanks to the splitting/trimming), then we need to increase ++ * data-end-seq by 1 if this here is a data-fin. ++ * ++ * We need to do -1 because end_seq includes the subflow-FIN. ++ */ ++ if (tp->mptcp->map_data_fin && ++ end_seq == tp->mptcp->map_subseq + tp->mptcp->map_data_len) { ++ inc = 1; ++ ++ /* We manually set the fin-flag if it is a data-fin. For easy ++ * processing in tcp_recvmsg. ++ */ ++ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN; ++ } else { ++ /* We may have a subflow-fin with data but without data-fin */ ++ TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_FIN; ++ } ++ ++ /* Adapt data-seq's to the packet itself. We kinda transform the ++ * dss-mapping to a per-packet granularity. This is necessary to ++ * correctly handle overlapping mappings coming from different ++ * subflows. Otherwise it would be a complete mess. ++ */ ++ tcb->seq = ((u32)tp->mptcp->map_data_seq) + tcb->seq - tp->mptcp->map_subseq; ++ tcb->end_seq = tcb->seq + skb->len + inc; ++} ++ ++static inline void mptcp_reset_mapping(struct tcp_sock *tp, u32 old_copied_seq) ++{ ++ tp->mptcp->map_data_len = 0; ++ tp->mptcp->map_data_seq = 0; ++ tp->mptcp->map_subseq = 0; ++ tp->mptcp->map_data_fin = 0; ++ tp->mptcp->mapping_present = 0; ++ ++ /* In infinite mapping receiver mode, we have to advance the implied ++ * data-sequence number when we progress the subflow's data. ++ */ ++ if (tp->mpcb->infinite_mapping_rcv) ++ tp->mpcb->infinite_rcv_seq += (tp->copied_seq - old_copied_seq); ++} ++ ++/* The DSS-mapping received on the sk only covers the second half of the skb ++ * (cut at seq). We trim the head from the skb. ++ * Data will be freed upon kfree(). ++ * ++ * Inspired by tcp_trim_head(). ++ */ ++static void mptcp_skb_trim_head(struct sk_buff *skb, struct sock *sk, u32 seq) ++{ ++ int len = seq - TCP_SKB_CB(skb)->seq; ++ u32 new_seq = TCP_SKB_CB(skb)->seq + len; ++ u32 delta_truesize; ++ ++ delta_truesize = __pskb_trim_head(skb, len); ++ ++ TCP_SKB_CB(skb)->seq = new_seq; ++ ++ if (delta_truesize) { ++ skb->truesize -= delta_truesize; ++ atomic_sub(delta_truesize, &sk->sk_rmem_alloc); ++ sk_mem_uncharge(sk, delta_truesize); ++ } ++} ++ ++/* The DSS-mapping received on the sk only covers the first half of the skb ++ * (cut at seq). We create a second skb (@return), and queue it in the rcv-queue ++ * as further packets may resolve the mapping of the second half of data. ++ * ++ * Inspired by tcp_fragment(). ++ */ ++static int mptcp_skb_split_tail(struct sk_buff *skb, struct sock *sk, u32 seq) ++{ ++ struct sk_buff *buff; ++ int nsize; ++ int nlen, len; ++ u8 flags; ++ ++ len = seq - TCP_SKB_CB(skb)->seq; ++ nsize = skb_headlen(skb) - len + tcp_sk(sk)->tcp_header_len; ++ if (nsize < 0) ++ nsize = 0; ++ ++ /* Get a new skb... force flag on. 
*/ ++ buff = alloc_skb(nsize, GFP_ATOMIC); ++ if (buff == NULL) ++ return -ENOMEM; ++ ++ skb_reserve(buff, tcp_sk(sk)->tcp_header_len); ++ skb_reset_transport_header(buff); ++ ++ flags = TCP_SKB_CB(skb)->tcp_flags; ++ TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN); ++ TCP_SKB_CB(buff)->tcp_flags = flags; ++ ++ /* We absolutly need to call skb_set_owner_r before refreshing the ++ * truesize of buff, otherwise the moved data will account twice. ++ */ ++ skb_set_owner_r(buff, sk); ++ nlen = skb->len - len - nsize; ++ buff->truesize += nlen; ++ skb->truesize -= nlen; ++ ++ /* Correct the sequence numbers. */ ++ TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len; ++ TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq; ++ TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq; ++ ++ skb_split(skb, buff, len); ++ ++ __skb_queue_after(&sk->sk_receive_queue, skb, buff); ++ ++ return 0; ++} ++ ++/* @return: 0 everything is fine. Just continue processing ++ * 1 subflow is broken stop everything ++ * -1 this packet was broken - continue with the next one. ++ */ ++static int mptcp_prevalidate_skb(struct sock *sk, struct sk_buff *skb) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct mptcp_cb *mpcb = tp->mpcb; ++ ++ /* If we are in infinite mode, the subflow-fin is in fact a data-fin. */ ++ if (!skb->len && (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) && ++ !mptcp_is_data_fin(skb) && !mpcb->infinite_mapping_rcv) { ++ /* Remove a pure subflow-fin from the queue and increase ++ * copied_seq. ++ */ ++ tp->copied_seq = TCP_SKB_CB(skb)->end_seq; ++ __skb_unlink(skb, &sk->sk_receive_queue); ++ __kfree_skb(skb); ++ return -1; ++ } ++ ++ /* If we are not yet fully established and do not know the mapping for ++ * this segment, this path has to fallback to infinite or be torn down. ++ */ ++ if (!tp->mptcp->fully_established && !mptcp_is_data_seq(skb) && ++ !mptcp_is_data_mpcapable(skb) && ++ !tp->mptcp->mapping_present && !mpcb->infinite_mapping_rcv) { ++ pr_debug("%s %#x will fallback - pi %d from %pS, seq %u mptcp-flags %#x\n", ++ __func__, mpcb->mptcp_loc_token, ++ tp->mptcp->path_index, __builtin_return_address(0), ++ TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->mptcp_flags); ++ ++ if (!is_master_tp(tp)) { ++ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_FBDATASUB); ++ mptcp_send_reset(sk); ++ return 1; ++ } ++ ++ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_FBDATAINIT); ++ ++ mpcb->infinite_mapping_snd = 1; ++ mpcb->infinite_mapping_rcv = 1; ++ mpcb->infinite_rcv_seq = mptcp_get_rcv_nxt_64(mptcp_meta_tp(tp)); ++ ++ mptcp_fallback_close(mpcb, sk); ++ ++ /* We do a seamless fallback and should not send a inf.mapping. */ ++ mpcb->send_infinite_mapping = 0; ++ tp->mptcp->fully_established = 1; ++ } ++ ++ /* Receiver-side becomes fully established when a whole rcv-window has ++ * been received without the need to fallback due to the previous ++ * condition. ++ */ ++ if (!tp->mptcp->fully_established) { ++ tp->mptcp->init_rcv_wnd -= skb->len; ++ if (tp->mptcp->init_rcv_wnd < 0) ++ mptcp_become_fully_estab(sk); ++ } ++ ++ return 0; ++} ++ ++static void mptcp_restart_sending(struct sock *meta_sk) ++{ ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk); ++ struct mptcp_cb *mpcb = meta_tp->mpcb; ++ struct sk_buff *wq_head, *skb, *tmp; ++ ++ skb = tcp_rtx_queue_head(meta_sk); ++ ++ /* We resend everything that has not been acknowledged, thus we need ++ * to move it from the rtx-tree to the write-queue. 
++ */ ++ wq_head = tcp_write_queue_head(meta_sk); ++ ++ skb_rbtree_walk_from_safe(skb, tmp) { ++ list_del(&skb->tcp_tsorted_anchor); ++ tcp_rtx_queue_unlink(skb, meta_sk); ++ INIT_LIST_HEAD(&skb->tcp_tsorted_anchor); ++ ++ if (wq_head) ++ __skb_queue_before(&meta_sk->sk_write_queue, wq_head, skb); ++ else ++ tcp_add_write_queue_tail(meta_sk, skb); ++ } ++ ++ /* We artificially restart the whole send-queue. Thus, ++ * it is as if no packets are in flight ++ */ ++ meta_tp->packets_out = 0; ++ ++ /* If the snd_nxt already wrapped around, we have to ++ * undo the wrapping, as we are restarting from snd_una ++ * on. ++ */ ++ if (meta_tp->snd_nxt < meta_tp->snd_una) { ++ mpcb->snd_high_order[mpcb->snd_hiseq_index] -= 2; ++ mpcb->snd_hiseq_index = mpcb->snd_hiseq_index ? 0 : 1; ++ } ++ meta_tp->snd_nxt = meta_tp->snd_una; ++ ++ /* Trigger a sending on the meta. */ ++ mptcp_push_pending_frames(meta_sk); ++} ++ ++/* @return: 0 everything is fine. Just continue processing ++ * 1 subflow is broken stop everything ++ * -1 this packet was broken - continue with the next one. ++ */ ++static int mptcp_detect_mapping(struct sock *sk, struct sk_buff *skb) ++{ ++ struct tcp_sock *tp = tcp_sk(sk), *meta_tp = mptcp_meta_tp(tp); ++ struct mptcp_cb *mpcb = tp->mpcb; ++ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); ++ u32 *ptr; ++ u32 data_seq, sub_seq, data_len, tcp_end_seq; ++ bool set_infinite_rcv = false; ++ ++ /* If we are in infinite-mapping-mode, the subflow is guaranteed to be ++ * in-order at the data-level. Thus data-seq-numbers can be inferred ++ * from what is expected at the data-level. ++ */ ++ if (mpcb->infinite_mapping_rcv) { ++ /* copied_seq may be bigger than tcb->seq (e.g., when the peer ++ * retransmits data that actually has already been acknowledged with ++ * newer data, if he did not receive our acks). Thus, we need ++ * to account for this overlap as well. ++ */ ++ tp->mptcp->map_data_seq = mpcb->infinite_rcv_seq - (tp->copied_seq - tcb->seq); ++ tp->mptcp->map_subseq = tcb->seq; ++ tp->mptcp->map_data_len = skb->len; ++ tp->mptcp->map_data_fin = !!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN); ++ tp->mptcp->mapping_present = 1; ++ return 0; ++ } ++ ++ if (!tp->mptcp->mapping_present && mptcp_is_data_mpcapable(skb)) { ++ __u32 *ptr = (__u32 *)(skb_transport_header(skb) + TCP_SKB_CB(skb)->dss_off); ++ ++ sub_seq = 1 + tp->mptcp->rcv_isn; ++ data_seq = meta_tp->rcv_nxt; ++ data_len = get_unaligned_be16(ptr); ++ } else if (!mptcp_is_data_seq(skb)) { ++ /* No mapping here? ++ * Exit - it is either already set or still on its way ++ */ ++ if (!tp->mptcp->mapping_present && ++ tp->rcv_nxt - tp->copied_seq > 65536) { ++ /* Too many packets without a mapping, ++ * this subflow is broken ++ */ ++ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_NODSSWINDOW); ++ mptcp_send_reset(sk); ++ return 1; ++ } ++ ++ return 0; ++ } else { ++ /* Well, then the DSS-mapping is there. So, read it! */ ++ ptr = mptcp_skb_set_data_seq(skb, &data_seq, mpcb); ++ ptr++; ++ sub_seq = get_unaligned_be32(ptr) + tp->mptcp->rcv_isn; ++ ptr++; ++ data_len = get_unaligned_be16(ptr); ++ } ++ ++ /* If it's an empty skb with DATA_FIN, sub_seq must get fixed. ++ * The draft sets it to 0, but we really would like to have the ++ * real value, to have an easy handling afterwards here in this ++ * function. ++ */ ++ if (mptcp_is_data_fin(skb) && skb->len == 0) ++ sub_seq = TCP_SKB_CB(skb)->seq; ++ ++ /* If there is already a mapping - we check if it maps with the current ++ * one. If not - we reset. 
++ */ ++ if (tp->mptcp->mapping_present && ++ (data_seq != (u32)tp->mptcp->map_data_seq || ++ sub_seq != tp->mptcp->map_subseq || ++ data_len != tp->mptcp->map_data_len + tp->mptcp->map_data_fin || ++ mptcp_is_data_fin(skb) != tp->mptcp->map_data_fin)) { ++ /* Mapping in packet is different from what we want */ ++ pr_debug("%s Mappings do not match!\n", __func__); ++ pr_debug("%s dseq %u mdseq %u, sseq %u msseq %u dlen %u mdlen %u dfin %d mdfin %d\n", ++ __func__, data_seq, (u32)tp->mptcp->map_data_seq, ++ sub_seq, tp->mptcp->map_subseq, data_len, ++ tp->mptcp->map_data_len, mptcp_is_data_fin(skb), ++ tp->mptcp->map_data_fin); ++ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DSSNOMATCH); ++ mptcp_send_reset(sk); ++ return 1; ++ } ++ ++ /* If the previous check was good, the current mapping is valid and we exit. */ ++ if (tp->mptcp->mapping_present) ++ return 0; ++ ++ /* Mapping not yet set on this subflow - we set it here! */ ++ ++ if (!data_len) { ++ mpcb->infinite_mapping_rcv = 1; ++ mpcb->send_infinite_mapping = 1; ++ tp->mptcp->fully_established = 1; ++ /* We need to repeat mp_fail's until the sender felt ++ * back to infinite-mapping - here we stop repeating it. ++ */ ++ tp->mptcp->send_mp_fail = 0; ++ ++ /* We have to fixup data_len - it must be the same as skb->len */ ++ data_len = skb->len + (mptcp_is_data_fin(skb) ? 1 : 0); ++ sub_seq = tcb->seq; ++ ++ mptcp_restart_sending(tp->meta_sk); ++ ++ mptcp_fallback_close(mpcb, sk); ++ ++ /* data_seq and so on are set correctly */ ++ ++ /* At this point, the meta-ofo-queue has to be emptied, ++ * as the following data is guaranteed to be in-order at ++ * the data and subflow-level ++ */ ++ skb_rbtree_purge(&meta_tp->out_of_order_queue); ++ ++ set_infinite_rcv = true; ++ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_INFINITEMAPRX); ++ } ++ ++ /* We are sending mp-fail's and thus are in fallback mode. ++ * Ignore packets which do not announce the fallback and still ++ * want to provide a mapping. ++ */ ++ if (tp->mptcp->send_mp_fail) { ++ tp->copied_seq = TCP_SKB_CB(skb)->end_seq; ++ __skb_unlink(skb, &sk->sk_receive_queue); ++ __kfree_skb(skb); ++ return -1; ++ } ++ ++ /* FIN increased the mapping-length by 1 */ ++ if (mptcp_is_data_fin(skb)) ++ data_len--; ++ ++ /* Subflow-sequences of packet must be ++ * (at least partially) be part of the DSS-mapping's ++ * subflow-sequence-space. ++ * ++ * Basically the mapping is not valid, if either of the ++ * following conditions is true: ++ * ++ * 1. It's not a data_fin and ++ * MPTCP-sub_seq >= TCP-end_seq ++ * ++ * 2. It's a data_fin and TCP-end_seq > TCP-seq and ++ * MPTCP-sub_seq >= TCP-end_seq ++ * ++ * The previous two can be merged into: ++ * TCP-end_seq > TCP-seq and MPTCP-sub_seq >= TCP-end_seq ++ * Because if it's not a data-fin, TCP-end_seq > TCP-seq ++ * ++ * 3. It's a data_fin and skb->len == 0 and ++ * MPTCP-sub_seq > TCP-end_seq ++ * ++ * 4. It's not a data_fin and TCP-end_seq > TCP-seq and ++ * MPTCP-sub_seq + MPTCP-data_len <= TCP-seq ++ */ ++ ++ /* subflow-fin is not part of the mapping - ignore it here ! */ ++ tcp_end_seq = tcb->end_seq; ++ if (tcb->tcp_flags & TCPHDR_FIN) ++ tcp_end_seq--; ++ if ((!before(sub_seq, tcb->end_seq) && after(tcp_end_seq, tcb->seq)) || ++ (mptcp_is_data_fin(skb) && skb->len == 0 && after(sub_seq, tcb->end_seq)) || ++ (!after(sub_seq + data_len, tcb->seq) && after(tcp_end_seq, tcb->seq))) { ++ /* Subflow-sequences of packet is different from what is in the ++ * packet's dss-mapping. 
The peer is misbehaving - reset ++ */ ++ pr_debug("%s Packet's mapping does not map to the DSS sub_seq %u end_seq %u, tcp_end_seq %u seq %u dfin %u len %u data_len %u copied_seq %u\n", ++ __func__, sub_seq, tcb->end_seq, tcp_end_seq, ++ tcb->seq, mptcp_is_data_fin(skb), ++ skb->len, data_len, tp->copied_seq); ++ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DSSTCPMISMATCH); ++ mptcp_send_reset(sk); ++ return 1; ++ } ++ ++ /* Does the DSS had 64-bit seqnum's ? */ ++ if (!(tcb->mptcp_flags & MPTCPHDR_SEQ64_SET)) { ++ /* Wrapped around? */ ++ if (unlikely(after(data_seq, meta_tp->rcv_nxt) && data_seq < meta_tp->rcv_nxt)) { ++ tp->mptcp->map_data_seq = mptcp_get_data_seq_64(mpcb, !mpcb->rcv_hiseq_index, data_seq); ++ } else { ++ /* Else, access the default high-order bits */ ++ tp->mptcp->map_data_seq = mptcp_get_data_seq_64(mpcb, mpcb->rcv_hiseq_index, data_seq); ++ } ++ } else { ++ tp->mptcp->map_data_seq = mptcp_get_data_seq_64(mpcb, (tcb->mptcp_flags & MPTCPHDR_SEQ64_INDEX) ? 1 : 0, data_seq); ++ ++ if (unlikely(tcb->mptcp_flags & MPTCPHDR_SEQ64_OFO)) { ++ /* We make sure that the data_seq is invalid. ++ * It will be dropped later. ++ */ ++ tp->mptcp->map_data_seq += 0xFFFFFFFF; ++ tp->mptcp->map_data_seq += 0xFFFFFFFF; ++ } ++ } ++ ++ if (set_infinite_rcv) ++ mpcb->infinite_rcv_seq = tp->mptcp->map_data_seq; ++ ++ tp->mptcp->map_data_len = data_len; ++ tp->mptcp->map_subseq = sub_seq; ++ tp->mptcp->map_data_fin = mptcp_is_data_fin(skb) ? 1 : 0; ++ tp->mptcp->mapping_present = 1; ++ ++ return 0; ++} ++ ++/* Similar to tcp_sequence(...) */ ++static inline bool mptcp_sequence(const struct tcp_sock *meta_tp, ++ u64 data_seq, u64 end_data_seq) ++{ ++ const struct mptcp_cb *mpcb = meta_tp->mpcb; ++ u64 rcv_wup64; ++ ++ /* Wrap-around? */ ++ if (meta_tp->rcv_wup > meta_tp->rcv_nxt) { ++ rcv_wup64 = ((u64)(mpcb->rcv_high_order[mpcb->rcv_hiseq_index] - 1) << 32) | ++ meta_tp->rcv_wup; ++ } else { ++ rcv_wup64 = mptcp_get_data_seq_64(mpcb, mpcb->rcv_hiseq_index, ++ meta_tp->rcv_wup); ++ } ++ ++ return !before64(end_data_seq, rcv_wup64) && ++ !after64(data_seq, mptcp_get_rcv_nxt_64(meta_tp) + tcp_receive_window_now(meta_tp)); ++} ++ ++/* @return: 0 everything is fine. Just continue processing ++ * -1 this packet was broken - continue with the next one. ++ */ ++static int mptcp_validate_mapping(struct sock *sk, struct sk_buff *skb) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct sk_buff *tmp, *tmp1; ++ u32 tcp_end_seq; ++ ++ if (!tp->mptcp->mapping_present) ++ return 0; ++ ++ /* either, the new skb gave us the mapping and the first segment ++ * in the sub-rcv-queue has to be trimmed ... ++ */ ++ tmp = skb_peek(&sk->sk_receive_queue); ++ if (before(TCP_SKB_CB(tmp)->seq, tp->mptcp->map_subseq) && ++ after(TCP_SKB_CB(tmp)->end_seq, tp->mptcp->map_subseq)) { ++ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DSSTRIMHEAD); ++ mptcp_skb_trim_head(tmp, sk, tp->mptcp->map_subseq); ++ } ++ ++ /* ... or the new skb (tail) has to be split at the end. */ ++ tcp_end_seq = TCP_SKB_CB(skb)->end_seq; ++ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) ++ tcp_end_seq--; ++ if (after(tcp_end_seq, tp->mptcp->map_subseq + tp->mptcp->map_data_len)) { ++ u32 seq = tp->mptcp->map_subseq + tp->mptcp->map_data_len; ++ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DSSSPLITTAIL); ++ if (mptcp_skb_split_tail(skb, sk, seq)) { /* Allocation failed */ ++ /* TODO : maybe handle this here better. ++ * We now just force meta-retransmission. 
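++			 * Dropping the skb here advances copied_seq past it, so the
++			 * data can only be recovered by a data-level retransmission
++			 * on another subflow.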
++ */ ++ tp->copied_seq = TCP_SKB_CB(skb)->end_seq; ++ __skb_unlink(skb, &sk->sk_receive_queue); ++ __kfree_skb(skb); ++ return -1; ++ } ++ } ++ ++ /* Now, remove old sk_buff's from the receive-queue. ++ * This may happen if the mapping has been lost for these segments and ++ * the next mapping has already been received. ++ */ ++ if (before(TCP_SKB_CB(skb_peek(&sk->sk_receive_queue))->seq, tp->mptcp->map_subseq)) { ++ skb_queue_walk_safe(&sk->sk_receive_queue, tmp1, tmp) { ++ if (!before(TCP_SKB_CB(tmp1)->seq, tp->mptcp->map_subseq)) ++ break; ++ ++ tp->copied_seq = TCP_SKB_CB(tmp1)->end_seq; ++ __skb_unlink(tmp1, &sk->sk_receive_queue); ++ ++ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_PURGEOLD); ++ /* Impossible that we could free skb here, because his ++ * mapping is known to be valid from previous checks ++ */ ++ __kfree_skb(tmp1); ++ } ++ } ++ ++ return 0; ++} ++ ++/* @return: 0 everything is fine. Just continue processing ++ * 1 subflow is broken stop everything ++ * -1 this mapping has been put in the meta-receive-queue ++ * -2 this mapping has been eaten by the application ++ */ ++static int mptcp_queue_skb(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk), *meta_tp = mptcp_meta_tp(tp); ++ struct sock *meta_sk = mptcp_meta_sk(sk); ++ struct mptcp_cb *mpcb = tp->mpcb; ++ struct sk_buff *tmp, *tmp1; ++ u64 rcv_nxt64 = mptcp_get_rcv_nxt_64(meta_tp); ++ u32 old_copied_seq = tp->copied_seq; ++ bool data_queued = false; ++ ++ /* Have we not yet received the full mapping? */ ++ if (!tp->mptcp->mapping_present || ++ before(tp->rcv_nxt, tp->mptcp->map_subseq + tp->mptcp->map_data_len)) ++ return 0; ++ ++ /* Is this an overlapping mapping? rcv_nxt >= end_data_seq ++ * OR ++ * This mapping is out of window ++ */ ++ if (!before64(rcv_nxt64, tp->mptcp->map_data_seq + tp->mptcp->map_data_len + tp->mptcp->map_data_fin) || ++ !mptcp_sequence(meta_tp, tp->mptcp->map_data_seq, ++ tp->mptcp->map_data_seq + tp->mptcp->map_data_len + tp->mptcp->map_data_fin)) { ++ skb_queue_walk_safe(&sk->sk_receive_queue, tmp1, tmp) { ++ __skb_unlink(tmp1, &sk->sk_receive_queue); ++ tp->copied_seq = TCP_SKB_CB(tmp1)->end_seq; ++ __kfree_skb(tmp1); ++ ++ if (!skb_queue_empty(&sk->sk_receive_queue) && ++ !before(TCP_SKB_CB(tmp)->seq, ++ tp->mptcp->map_subseq + tp->mptcp->map_data_len)) ++ break; ++ } ++ ++ mptcp_reset_mapping(tp, old_copied_seq); ++ ++ return -1; ++ } ++ ++ /* Record it, because we want to send our data_fin on the same path */ ++ if (tp->mptcp->map_data_fin) { ++ mpcb->dfin_path_index = tp->mptcp->path_index; ++ mpcb->dfin_combined = !!(sk->sk_shutdown & RCV_SHUTDOWN); ++ } ++ ++ /* Verify the checksum */ ++ if (mpcb->dss_csum && !mpcb->infinite_mapping_rcv) { ++ int ret = mptcp_verif_dss_csum(sk); ++ ++ if (ret <= 0) { ++ mptcp_reset_mapping(tp, old_copied_seq); ++ return 1; ++ } ++ } ++ ++ if (before64(rcv_nxt64, tp->mptcp->map_data_seq)) { ++ /* Seg's have to go to the meta-ofo-queue */ ++ skb_queue_walk_safe(&sk->sk_receive_queue, tmp1, tmp) { ++ tp->copied_seq = TCP_SKB_CB(tmp1)->end_seq; ++ mptcp_prepare_skb(tmp1, sk); ++ __skb_unlink(tmp1, &sk->sk_receive_queue); ++ /* MUST be done here, because fragstolen may be true later. ++ * Then, kfree_skb_partial will not account the memory. 
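++			 * skb_orphan() detaches the skb from the subflow socket's
++			 * memory accounting before it is queued at the meta level.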
++ */ ++ skb_orphan(tmp1); ++ ++ if (!mpcb->in_time_wait) /* In time-wait, do not receive data */ ++ tcp_data_queue_ofo(meta_sk, tmp1); ++ else ++ __kfree_skb(tmp1); ++ ++ if (!skb_queue_empty(&sk->sk_receive_queue) && ++ !before(TCP_SKB_CB(tmp)->seq, ++ tp->mptcp->map_subseq + tp->mptcp->map_data_len)) ++ break; ++ } ++ ++ /* Quick ACK if more 3/4 of the receive window is filled */ ++ if (after64(tp->mptcp->map_data_seq, ++ rcv_nxt64 + 3 * (tcp_receive_window_now(meta_tp) >> 2))) ++ tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS); ++ ++ } else { ++ /* Ready for the meta-rcv-queue */ ++ skb_queue_walk_safe(&sk->sk_receive_queue, tmp1, tmp) { ++ int eaten = 0; ++ bool fragstolen = false; ++ u32 old_rcv_nxt = meta_tp->rcv_nxt; ++ ++ tp->copied_seq = TCP_SKB_CB(tmp1)->end_seq; ++ mptcp_prepare_skb(tmp1, sk); ++ __skb_unlink(tmp1, &sk->sk_receive_queue); ++ /* MUST be done here, because fragstolen may be true. ++ * Then, kfree_skb_partial will not account the memory. ++ */ ++ skb_orphan(tmp1); ++ ++ /* This segment has already been received */ ++ if (!after(TCP_SKB_CB(tmp1)->end_seq, meta_tp->rcv_nxt)) { ++ __kfree_skb(tmp1); ++ goto next; ++ } ++ ++ if (mpcb->in_time_wait) /* In time-wait, do not receive data */ ++ eaten = 1; ++ ++ if (!eaten) ++ eaten = tcp_queue_rcv(meta_sk, tmp1, &fragstolen); ++ ++ meta_tp->rcv_nxt = TCP_SKB_CB(tmp1)->end_seq; ++ ++ if (TCP_SKB_CB(tmp1)->tcp_flags & TCPHDR_FIN) ++ mptcp_fin(meta_sk); ++ ++ /* Check if this fills a gap in the ofo queue */ ++ if (!RB_EMPTY_ROOT(&meta_tp->out_of_order_queue)) ++ tcp_ofo_queue(meta_sk); ++ ++ mptcp_check_rcvseq_wrap(meta_tp, old_rcv_nxt); ++ ++ if (eaten) ++ kfree_skb_partial(tmp1, fragstolen); ++ ++ data_queued = true; ++next: ++ if (!skb_queue_empty(&sk->sk_receive_queue) && ++ !before(TCP_SKB_CB(tmp)->seq, ++ tp->mptcp->map_subseq + tp->mptcp->map_data_len)) ++ break; ++ } ++ } ++ ++ inet_csk(meta_sk)->icsk_ack.lrcvtime = tcp_jiffies32; ++ mptcp_reset_mapping(tp, old_copied_seq); ++ ++ return data_queued ? -1 : -2; ++} ++ ++void mptcp_data_ready(struct sock *sk) ++{ ++ struct sock *meta_sk = mptcp_meta_sk(sk); ++ struct sk_buff *skb, *tmp; ++ int queued = 0; ++ ++ tcp_mstamp_refresh(tcp_sk(meta_sk)); ++ ++ /* restart before the check, because mptcp_fin might have changed the ++ * state. ++ */ ++restart: ++ /* If the meta cannot receive data, there is no point in pushing data. ++ * If we are in time-wait, we may still be waiting for the final FIN. ++ * So, we should proceed with the processing. ++ */ ++ if (!mptcp_sk_can_recv(meta_sk) && !tcp_sk(sk)->mpcb->in_time_wait) { ++ skb_queue_purge(&sk->sk_receive_queue); ++ tcp_sk(sk)->copied_seq = tcp_sk(sk)->rcv_nxt; ++ goto exit; ++ } ++ ++ /* Iterate over all segments, detect their mapping (if we don't have ++ * one yet), validate them and push everything one level higher. 
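++	 * A negative return from one of the helpers means the current skb was
++	 * consumed or dropped and the walk must restart; a positive return
++	 * means the subflow is broken and we stop.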
++ */ ++ skb_queue_walk_safe(&sk->sk_receive_queue, skb, tmp) { ++ int ret; ++ /* Pre-validation - e.g., early fallback */ ++ ret = mptcp_prevalidate_skb(sk, skb); ++ if (ret < 0) ++ goto restart; ++ else if (ret > 0) ++ break; ++ ++ /* Set the current mapping */ ++ ret = mptcp_detect_mapping(sk, skb); ++ if (ret < 0) ++ goto restart; ++ else if (ret > 0) ++ break; ++ ++ /* Validation */ ++ if (mptcp_validate_mapping(sk, skb) < 0) ++ goto restart; ++ ++ /* Push a level higher */ ++ ret = mptcp_queue_skb(sk); ++ if (ret < 0) { ++ if (ret == -1) ++ queued = ret; ++ goto restart; ++ } else if (ret == 0) { ++ continue; ++ } else { /* ret == 1 */ ++ break; ++ } ++ } ++ ++exit: ++ if (tcp_sk(sk)->close_it && sk->sk_state == TCP_FIN_WAIT2) { ++ tcp_send_ack(sk); ++ tcp_sk(sk)->ops->time_wait(sk, TCP_TIME_WAIT, 0); ++ } ++ ++ if (queued == -1 && !sock_flag(meta_sk, SOCK_DEAD)) ++ meta_sk->sk_data_ready(meta_sk); ++} ++ ++struct mp_join *mptcp_find_join(const struct sk_buff *skb) ++{ ++ const struct tcphdr *th = tcp_hdr(skb); ++ unsigned char *ptr; ++ int length = (th->doff * 4) - sizeof(struct tcphdr); ++ ++ /* Jump through the options to check whether JOIN is there */ ++ ptr = (unsigned char *)(th + 1); ++ while (length > 0) { ++ int opcode = *ptr++; ++ int opsize; ++ ++ switch (opcode) { ++ case TCPOPT_EOL: ++ return NULL; ++ case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ ++ length--; ++ continue; ++ default: ++ opsize = *ptr++; ++ if (opsize < 2) /* "silly options" */ ++ return NULL; ++ if (opsize > length) ++ return NULL; /* don't parse partial options */ ++ if (opcode == TCPOPT_MPTCP && ++ ((struct mptcp_option *)(ptr - 2))->sub == MPTCP_SUB_JOIN) { ++ return (struct mp_join *)(ptr - 2); ++ } ++ ptr += opsize - 2; ++ length -= opsize; ++ } ++ } ++ return NULL; ++} ++ ++int mptcp_lookup_join(struct sk_buff *skb, struct inet_timewait_sock *tw) ++{ ++ struct sock *meta_sk; ++ u32 token; ++ bool meta_v4; ++ struct mp_join *join_opt = mptcp_find_join(skb); ++ if (!join_opt) ++ return 0; ++ ++ /* MPTCP structures were not initialized, so return error */ ++ if (mptcp_init_failed) ++ return -1; ++ ++ token = join_opt->u.syn.token; ++ meta_sk = mptcp_hash_find(dev_net(skb_dst(skb)->dev), token); ++ if (!meta_sk) { ++ MPTCP_INC_STATS(dev_net(skb_dst(skb)->dev), MPTCP_MIB_JOINNOTOKEN); ++ mptcp_debug("%s:mpcb not found:%x\n", __func__, token); ++ return -1; ++ } ++ ++ meta_v4 = meta_sk->sk_family == AF_INET; ++ if (meta_v4) { ++ if (skb->protocol == htons(ETH_P_IPV6)) { ++ mptcp_debug("SYN+MP_JOIN with IPV6 address on pure IPV4 meta\n"); ++ sock_put(meta_sk); /* Taken by mptcp_hash_find */ ++ return -1; ++ } ++ } else if (skb->protocol == htons(ETH_P_IP) && meta_sk->sk_ipv6only) { ++ mptcp_debug("SYN+MP_JOIN with IPV4 address on IPV6_V6ONLY meta\n"); ++ sock_put(meta_sk); /* Taken by mptcp_hash_find */ ++ return -1; ++ } ++ ++ /* Coming from time-wait-sock processing in tcp_v4_rcv. ++ * We have to deschedule it before continuing, because otherwise ++ * mptcp_v4_do_rcv will hit again on it inside tcp_v4_hnd_req. 
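++	 * inet_twsk_deschedule_put() both stops the timewait timer and drops
++	 * the timewait reference.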
++ */ ++ if (tw) ++ inet_twsk_deschedule_put(tw); ++ ++ /* OK, this is a new syn/join, let's create a new open request and ++ * send syn+ack ++ */ ++ if (skb->protocol == htons(ETH_P_IP)) { ++ tcp_v4_do_rcv(meta_sk, skb); ++#if IS_ENABLED(CONFIG_IPV6) ++ } else { ++ tcp_v6_do_rcv(meta_sk, skb); ++#endif /* CONFIG_IPV6 */ ++ } ++ sock_put(meta_sk); /* Taken by mptcp_hash_find */ ++ return 1; ++} ++ ++int mptcp_do_join_short(struct sk_buff *skb, ++ const struct mptcp_options_received *mopt, ++ struct net *net) ++{ ++ struct sock *meta_sk; ++ u32 token; ++ bool meta_v4; ++ ++ token = mopt->mptcp_rem_token; ++ meta_sk = mptcp_hash_find(net, token); ++ if (!meta_sk) { ++ MPTCP_INC_STATS(dev_net(skb_dst(skb)->dev), MPTCP_MIB_JOINNOTOKEN); ++ mptcp_debug("%s:mpcb not found:%x\n", __func__, token); ++ return -1; ++ } ++ ++ meta_v4 = meta_sk->sk_family == AF_INET; ++ if (meta_v4) { ++ if (skb->protocol == htons(ETH_P_IPV6)) { ++ mptcp_debug("SYN+MP_JOIN with IPV6 address on pure IPV4 meta\n"); ++ sock_put(meta_sk); /* Taken by mptcp_hash_find */ ++ return -1; ++ } ++ } else if (skb->protocol == htons(ETH_P_IP) && meta_sk->sk_ipv6only) { ++ mptcp_debug("SYN+MP_JOIN with IPV4 address on IPV6_V6ONLY meta\n"); ++ sock_put(meta_sk); /* Taken by mptcp_hash_find */ ++ return -1; ++ } ++ ++ /* OK, this is a new syn/join, let's create a new open request and ++ * send syn+ack ++ */ ++ ++ /* mptcp_v4_do_rcv tries to free the skb - we prevent this, as ++ * the skb will finally be freed by tcp_v4_do_rcv (where we are ++ * coming from) ++ */ ++ skb_get(skb); ++ if (skb->protocol == htons(ETH_P_IP)) { ++ tcp_v4_do_rcv(meta_sk, skb); ++#if IS_ENABLED(CONFIG_IPV6) ++ } else { /* IPv6 */ ++ tcp_v6_do_rcv(meta_sk, skb); ++#endif /* CONFIG_IPV6 */ ++ } ++ ++ sock_put(meta_sk); /* Taken by mptcp_hash_find */ ++ return 0; ++} ++ ++/** ++ * Equivalent of tcp_fin() for MPTCP ++ * Can be called only when the FIN is validly part ++ * of the data seqnum space. Not before when we get holes. ++ */ ++void mptcp_fin(struct sock *meta_sk) ++{ ++ struct sock *sk = NULL; ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk); ++ struct mptcp_cb *mpcb = meta_tp->mpcb; ++ struct mptcp_tcp_sock *mptcp; ++ unsigned char state; ++ ++ mptcp_for_each_sub(mpcb, mptcp) { ++ struct sock *sk_it = mptcp_to_sock(mptcp); ++ ++ if (tcp_sk(sk_it)->mptcp->path_index == mpcb->dfin_path_index) { ++ sk = sk_it; ++ break; ++ } ++ } ++ ++ if (!sk || sk->sk_state == TCP_CLOSE) ++ sk = mptcp_select_ack_sock(meta_sk); ++ ++ inet_csk_schedule_ack(sk); ++ ++ if (!mpcb->in_time_wait) { ++ meta_sk->sk_shutdown |= RCV_SHUTDOWN; ++ sock_set_flag(meta_sk, SOCK_DONE); ++ state = meta_sk->sk_state; ++ } else { ++ state = mpcb->mptw_state; ++ } ++ ++ switch (state) { ++ case TCP_SYN_RECV: ++ case TCP_ESTABLISHED: ++ /* Move to CLOSE_WAIT */ ++ tcp_set_state(meta_sk, TCP_CLOSE_WAIT); ++ inet_csk(sk)->icsk_ack.pingpong = 1; ++ break; ++ ++ case TCP_CLOSE_WAIT: ++ case TCP_CLOSING: ++ /* Received a retransmission of the FIN, do ++ * nothing. ++ */ ++ break; ++ case TCP_LAST_ACK: ++ /* RFC793: Remain in the LAST-ACK state. */ ++ break; ++ ++ case TCP_FIN_WAIT1: ++ /* This case occurs when a simultaneous close ++ * happens, we must ack the received FIN and ++ * enter the CLOSING state. ++ */ ++ tcp_send_ack(sk); ++ tcp_set_state(meta_sk, TCP_CLOSING); ++ break; ++ case TCP_FIN_WAIT2: ++ /* Received a FIN -- send ACK and enter TIME_WAIT. 
*/ ++ tcp_send_ack(sk); ++ meta_tp->ops->time_wait(meta_sk, TCP_TIME_WAIT, 0); ++ break; ++ default: ++ /* Only TCP_LISTEN and TCP_CLOSE are left, in these ++ * cases we should never reach this piece of code. ++ */ ++ pr_err("%s: Impossible, meta_sk->sk_state=%d\n", __func__, ++ meta_sk->sk_state); ++ break; ++ } ++ ++ /* It _is_ possible, that we have something out-of-order _after_ FIN. ++ * Probably, we should reset in this case. For now drop them. ++ */ ++ skb_rbtree_purge(&meta_tp->out_of_order_queue); ++ sk_mem_reclaim(meta_sk); ++ ++ if (!sock_flag(meta_sk, SOCK_DEAD)) { ++ meta_sk->sk_state_change(meta_sk); ++ ++ /* Do not send POLL_HUP for half duplex close. */ ++ if (meta_sk->sk_shutdown == SHUTDOWN_MASK || ++ meta_sk->sk_state == TCP_CLOSE) ++ sk_wake_async(meta_sk, SOCK_WAKE_WAITD, POLL_HUP); ++ else ++ sk_wake_async(meta_sk, SOCK_WAKE_WAITD, POLL_IN); ++ } ++ ++ return; ++} ++ ++/* Similar to tcp_xmit_retransmit_queue */ ++static void mptcp_xmit_retransmit_queue(struct sock *meta_sk) ++{ ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk); ++ struct sk_buff *skb, *rtx_head; ++ ++ if (!meta_tp->packets_out) ++ return; ++ ++ skb = rtx_head = tcp_rtx_queue_head(meta_sk); ++ skb_rbtree_walk_from(skb) { ++ if (mptcp_retransmit_skb(meta_sk, skb)) ++ return; ++ ++ if (skb == rtx_head) ++ inet_csk_reset_xmit_timer(meta_sk, ICSK_TIME_RETRANS, ++ inet_csk(meta_sk)->icsk_rto, ++ TCP_RTO_MAX); ++ } ++} ++ ++static void mptcp_snd_una_update(struct tcp_sock *meta_tp, u32 data_ack) ++{ ++ u32 delta = data_ack - meta_tp->snd_una; ++ ++ sock_owned_by_me((struct sock *)meta_tp); ++ meta_tp->bytes_acked += delta; ++ meta_tp->snd_una = data_ack; ++} ++ ++static void mptcp_stop_subflow_chronos(struct sock *meta_sk, ++ const enum tcp_chrono type) ++{ ++ const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; ++ struct mptcp_tcp_sock *mptcp; ++ ++ mptcp_for_each_sub(mpcb, mptcp) { ++ struct sock *sk_it = mptcp_to_sock(mptcp); ++ ++ tcp_chrono_stop(sk_it, type); ++ } ++} ++ ++/* Handle the DATA_ACK */ ++static bool mptcp_process_data_ack(struct sock *sk, const struct sk_buff *skb) ++{ ++ struct sock *meta_sk = mptcp_meta_sk(sk); ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk), *tp = tcp_sk(sk); ++ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); ++ u32 prior_snd_una = meta_tp->snd_una; ++ int prior_packets; ++ u32 nwin, data_ack, data_seq; ++ u16 data_len = 0; ++ ++ /* A valid packet came in - subflow is operational again */ ++ tp->pf = 0; ++ ++ /* Even if there is no data-ack, we stop retransmitting. ++ * Except if this is a SYN/ACK. Then it is just a retransmission ++ */ ++ if (tp->mptcp->pre_established && !tcp_hdr(skb)->syn) { ++ tp->mptcp->pre_established = 0; ++ sk_stop_timer(sk, &tp->mptcp->mptcp_ack_timer); ++ ++ if (meta_tp->mpcb->pm_ops->established_subflow) ++ meta_tp->mpcb->pm_ops->established_subflow(sk); ++ } ++ ++ /* If we are in infinite mapping mode, rx_opt.data_ack has been ++ * set by mptcp_clean_rtx_infinite. ++ */ ++ if (!(tcb->mptcp_flags & MPTCPHDR_ACK) && !tp->mpcb->infinite_mapping_snd) ++ return false; ++ ++ if (unlikely(!tp->mptcp->fully_established) && ++ tp->mptcp->snt_isn + 1 != TCP_SKB_CB(skb)->ack_seq) ++ /* As soon as a subflow-data-ack (not acking syn, thus snt_isn + 1) ++ * includes a data-ack, we are fully established ++ */ ++ mptcp_become_fully_estab(sk); ++ ++ /* After we did the subflow-only processing (stopping timer and marking ++ * subflow as established), check if we can proceed with MPTCP-level ++ * processing. 
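++	 * If the meta-socket is already closed there is nothing left to update
++	 * at the data level.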
++ */ ++ if (meta_sk->sk_state == TCP_CLOSE) ++ return false; ++ ++ /* Get the data_seq */ ++ if (mptcp_is_data_seq(skb)) { ++ data_seq = tp->mptcp->rx_opt.data_seq; ++ data_len = tp->mptcp->rx_opt.data_len; ++ } else { ++ data_seq = meta_tp->snd_wl1; ++ } ++ ++ data_ack = tp->mptcp->rx_opt.data_ack; ++ ++ /* If the ack is older than previous acks ++ * then we can probably ignore it. ++ */ ++ if (before(data_ack, prior_snd_una)) ++ goto exit; ++ ++ /* If the ack includes data we haven't sent yet, discard ++ * this segment (RFC793 Section 3.9). ++ */ ++ if (after(data_ack, meta_tp->snd_nxt)) ++ goto exit; ++ ++ /* First valid DATA_ACK, we can stop sending the special MP_CAPABLE */ ++ tp->mpcb->send_mptcpv1_mpcapable = 0; ++ ++ /*** Now, update the window - inspired by tcp_ack_update_window ***/ ++ nwin = ntohs(tcp_hdr(skb)->window); ++ ++ if (likely(!tcp_hdr(skb)->syn)) ++ nwin <<= tp->rx_opt.snd_wscale; ++ ++ if (tcp_may_update_window(meta_tp, data_ack, data_seq, nwin)) { ++ tcp_update_wl(meta_tp, data_seq); ++ ++ /* Draft v09, Section 3.3.5: ++ * [...] It should only update its local receive window values ++ * when the largest sequence number allowed (i.e. DATA_ACK + ++ * receive window) increases. [...] ++ */ ++ if (meta_tp->snd_wnd != nwin && ++ !before(data_ack + nwin, tcp_wnd_end(meta_tp))) { ++ meta_tp->snd_wnd = nwin; ++ ++ if (nwin > meta_tp->max_window) ++ meta_tp->max_window = nwin; ++ } ++ } ++ /*** Done, update the window ***/ ++ ++ /* We passed data and got it acked, remove any soft error ++ * log. Something worked... ++ */ ++ sk->sk_err_soft = 0; ++ inet_csk(meta_sk)->icsk_probes_out = 0; ++ meta_tp->rcv_tstamp = tcp_jiffies32; ++ prior_packets = meta_tp->packets_out; ++ if (!prior_packets) ++ goto no_queue; ++ ++ mptcp_snd_una_update(meta_tp, data_ack); ++ ++ mptcp_clean_rtx_queue(meta_sk, prior_snd_una); ++ ++ /* We are in loss-state, and something got acked, retransmit the whole ++ * queue now! ++ */ ++ if (inet_csk(meta_sk)->icsk_ca_state == TCP_CA_Loss && ++ after(data_ack, prior_snd_una)) { ++ mptcp_xmit_retransmit_queue(meta_sk); ++ inet_csk(meta_sk)->icsk_ca_state = TCP_CA_Open; ++ } ++ ++ /* Simplified version of tcp_new_space, because the snd-buffer ++ * is handled by all the subflows. ++ */ ++ if (sock_flag(meta_sk, SOCK_QUEUE_SHRUNK)) { ++ sock_reset_flag(meta_sk, SOCK_QUEUE_SHRUNK); ++ if (meta_sk->sk_socket && ++ test_bit(SOCK_NOSPACE, &meta_sk->sk_socket->flags)) ++ meta_sk->sk_write_space(meta_sk); ++ ++ if (meta_sk->sk_socket && ++ !test_bit(SOCK_NOSPACE, &meta_sk->sk_socket->flags)) { ++ tcp_chrono_stop(meta_sk, TCP_CHRONO_SNDBUF_LIMITED); ++ mptcp_stop_subflow_chronos(meta_sk, ++ TCP_CHRONO_SNDBUF_LIMITED); ++ } ++ } ++ ++ if (meta_sk->sk_state != TCP_ESTABLISHED) { ++ int ret = mptcp_rcv_state_process(meta_sk, sk, skb, data_seq, data_len); ++ ++ if (ret < 0) ++ return true; ++ else if (ret > 0) ++ return false; ++ } ++ ++exit: ++ mptcp_push_pending_frames(meta_sk); ++ ++ return false; ++ ++no_queue: ++ if (tcp_send_head(meta_sk)) ++ tcp_ack_probe(meta_sk); ++ ++ mptcp_push_pending_frames(meta_sk); ++ ++ return false; ++} ++ ++void mptcp_clean_rtx_infinite(const struct sk_buff *skb, struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk), *meta_tp = tcp_sk(mptcp_meta_sk(sk)); ++ ++ if (!tp->mpcb->infinite_mapping_snd) ++ return; ++ ++ /* The difference between both write_seq's represents the offset between ++ * data-sequence and subflow-sequence. As we are infinite, this must ++ * match. ++ * ++ * Thus, from this difference we can infer the meta snd_una. 
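++	 * (meta_tp->snd_nxt - tp->snd_nxt) is the constant offset between
++	 * data-level and subflow-level sequence numbers once the mapping is
++	 * infinite, so adding the subflow's snd_una yields the data-level ack.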
++ */ ++ tp->mptcp->rx_opt.data_ack = meta_tp->snd_nxt - tp->snd_nxt + ++ tp->snd_una; ++ ++ mptcp_process_data_ack(sk, skb); ++} ++ ++/**** static functions used by mptcp_parse_options */ ++ ++static void mptcp_send_reset_rem_id(const struct mptcp_cb *mpcb, u8 rem_id) ++{ ++ struct mptcp_tcp_sock *mptcp; ++ struct hlist_node *tmp; ++ ++ mptcp_for_each_sub_safe(mpcb, mptcp, tmp) { ++ struct sock *sk_it = mptcp_to_sock(mptcp); ++ ++ if (tcp_sk(sk_it)->mptcp->rem_id == rem_id) { ++ mptcp_reinject_data(sk_it, 0); ++ mptcp_send_reset(sk_it); ++ } ++ } ++} ++ ++static inline bool is_valid_addropt_opsize(u8 mptcp_ver, ++ struct mp_add_addr *mpadd, ++ int opsize) ++{ ++#if IS_ENABLED(CONFIG_IPV6) ++ if (mptcp_ver < MPTCP_VERSION_1 && mpadd->u_bit.v0.ipver == 6) { ++ return opsize == MPTCP_SUB_LEN_ADD_ADDR6 || ++ opsize == MPTCP_SUB_LEN_ADD_ADDR6 + 2; ++ } ++ if (mptcp_ver >= MPTCP_VERSION_1) ++ return opsize == MPTCP_SUB_LEN_ADD_ADDR6_VER1 || ++ opsize == MPTCP_SUB_LEN_ADD_ADDR6_VER1 + 2 || ++ opsize == MPTCP_SUB_LEN_ADD_ADDR4_VER1 || ++ opsize == MPTCP_SUB_LEN_ADD_ADDR4_VER1 + 2; ++#endif ++ if (mptcp_ver < MPTCP_VERSION_1 && mpadd->u_bit.v0.ipver == 4) { ++ return opsize == MPTCP_SUB_LEN_ADD_ADDR4 || ++ opsize == MPTCP_SUB_LEN_ADD_ADDR4 + 2; ++ } ++ if (mptcp_ver >= MPTCP_VERSION_1) { ++ return opsize == MPTCP_SUB_LEN_ADD_ADDR4_VER1 || ++ opsize == MPTCP_SUB_LEN_ADD_ADDR4_VER1 + 2; ++ } ++ return false; ++} ++ ++void mptcp_parse_options(const uint8_t *ptr, int opsize, ++ struct mptcp_options_received *mopt, ++ const struct sk_buff *skb, ++ struct tcp_sock *tp) ++{ ++ const struct mptcp_option *mp_opt = (struct mptcp_option *)ptr; ++ const struct tcphdr *th = tcp_hdr(skb); ++ ++ /* If the socket is mp-capable we would have a mopt. */ ++ if (!mopt) ++ return; ++ ++ switch (mp_opt->sub) { ++ case MPTCP_SUB_CAPABLE: ++ { ++ const struct mp_capable *mpcapable = (struct mp_capable *)ptr; ++ ++ if (mpcapable->ver == MPTCP_VERSION_0 && ++ ((th->syn && opsize != MPTCP_SUB_LEN_CAPABLE_SYN) || ++ (!th->syn && th->ack && opsize != MPTCP_SUB_LEN_CAPABLE_ACK))) { ++ mptcp_debug("%s: mp_capable v0: bad option size %d\n", ++ __func__, opsize); ++ break; ++ } ++ ++ if (mpcapable->ver == MPTCP_VERSION_1 && ++ ((th->syn && !th->ack && opsize != MPTCPV1_SUB_LEN_CAPABLE_SYN) || ++ (th->syn && th->ack && opsize != MPTCPV1_SUB_LEN_CAPABLE_SYNACK) || ++ (!th->syn && th->ack && opsize != MPTCPV1_SUB_LEN_CAPABLE_ACK && ++ opsize != MPTCPV1_SUB_LEN_CAPABLE_DATA && ++ opsize != MPTCPV1_SUB_LEN_CAPABLE_DATA_CSUM))) { ++ mptcp_debug("%s: mp_capable v1: bad option size %d\n", ++ __func__, opsize); ++ break; ++ } ++ ++ /* MPTCP-RFC 6824: ++ * "If receiving a message with the 'B' flag set to 1, and this ++ * is not understood, then this SYN MUST be silently ignored; ++ */ ++ if (mpcapable->b) { ++ mopt->drop_me = 1; ++ break; ++ } ++ ++ /* MPTCP-RFC 6824: ++ * "An implementation that only supports this method MUST set ++ * bit "H" to 1, and bits "C" through "G" to 0." 
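++		 * So a cleared 'H' bit means the peer requests a crypto algorithm
++		 * we do not support, and the option is ignored.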
++ */ ++ if (!mpcapable->h) ++ break; ++ ++ mopt->saw_mpc = 1; ++ mopt->dss_csum = sysctl_mptcp_checksum || mpcapable->a; ++ ++ if (mpcapable->ver == MPTCP_VERSION_0) { ++ if (opsize == MPTCP_SUB_LEN_CAPABLE_SYN) ++ mopt->mptcp_sender_key = mpcapable->sender_key; ++ ++ if (opsize == MPTCP_SUB_LEN_CAPABLE_ACK) { ++ mopt->mptcp_sender_key = mpcapable->sender_key; ++ mopt->mptcp_receiver_key = mpcapable->receiver_key; ++ } ++ } else if (mpcapable->ver == MPTCP_VERSION_1) { ++ if (opsize == MPTCPV1_SUB_LEN_CAPABLE_SYNACK) ++ mopt->mptcp_sender_key = mpcapable->sender_key; ++ ++ if (opsize == MPTCPV1_SUB_LEN_CAPABLE_ACK) { ++ mopt->mptcp_sender_key = mpcapable->sender_key; ++ mopt->mptcp_receiver_key = mpcapable->receiver_key; ++ } ++ ++ if (opsize == MPTCPV1_SUB_LEN_CAPABLE_DATA || ++ opsize == MPTCPV1_SUB_LEN_CAPABLE_DATA_CSUM) { ++ mopt->mptcp_sender_key = mpcapable->sender_key; ++ mopt->mptcp_receiver_key = mpcapable->receiver_key; ++ ++ TCP_SKB_CB(skb)->mptcp_flags |= MPTCPHDR_MPC_DATA; ++ ++ ptr += sizeof(struct mp_capable); ++ TCP_SKB_CB(skb)->dss_off = (ptr - skb_transport_header(skb)); ++ ++ /* Is a check-sum present? */ ++ if (opsize == MPTCPV1_SUB_LEN_CAPABLE_DATA_CSUM) ++ TCP_SKB_CB(skb)->mptcp_flags |= MPTCPHDR_DSS_CSUM; ++ } ++ } ++ ++ mopt->mptcp_ver = mpcapable->ver; ++ break; ++ } ++ case MPTCP_SUB_JOIN: ++ { ++ const struct mp_join *mpjoin = (struct mp_join *)ptr; ++ ++ if (opsize != MPTCP_SUB_LEN_JOIN_SYN && ++ opsize != MPTCP_SUB_LEN_JOIN_SYNACK && ++ opsize != MPTCP_SUB_LEN_JOIN_ACK) { ++ mptcp_debug("%s: mp_join: bad option size %d\n", ++ __func__, opsize); ++ break; ++ } ++ ++ /* saw_mpc must be set, because in tcp_check_req we assume that ++ * it is set to support falling back to reg. TCP if a rexmitted ++ * SYN has no MP_CAPABLE or MP_JOIN ++ */ ++ switch (opsize) { ++ case MPTCP_SUB_LEN_JOIN_SYN: ++ mopt->is_mp_join = 1; ++ mopt->saw_mpc = 1; ++ mopt->low_prio = mpjoin->b; ++ mopt->rem_id = mpjoin->addr_id; ++ mopt->mptcp_rem_token = mpjoin->u.syn.token; ++ mopt->mptcp_recv_nonce = mpjoin->u.syn.nonce; ++ break; ++ case MPTCP_SUB_LEN_JOIN_SYNACK: ++ mopt->saw_mpc = 1; ++ mopt->low_prio = mpjoin->b; ++ mopt->rem_id = mpjoin->addr_id; ++ mopt->mptcp_recv_tmac = mpjoin->u.synack.mac; ++ mopt->mptcp_recv_nonce = mpjoin->u.synack.nonce; ++ break; ++ case MPTCP_SUB_LEN_JOIN_ACK: ++ mopt->saw_mpc = 1; ++ mopt->join_ack = 1; ++ memcpy(mopt->mptcp_recv_mac, mpjoin->u.ack.mac, 20); ++ break; ++ } ++ break; ++ } ++ case MPTCP_SUB_DSS: ++ { ++ const struct mp_dss *mdss = (struct mp_dss *)ptr; ++ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); ++ ++ /* We check opsize for the csum and non-csum case. We do this, ++ * because the draft says that the csum SHOULD be ignored if ++ * it has not been negotiated in the MP_CAPABLE but still is ++ * present in the data. ++ * ++ * It will get ignored later in mptcp_queue_skb. 
++ */ ++ if (opsize != mptcp_sub_len_dss(mdss, 0) && ++ opsize != mptcp_sub_len_dss(mdss, 1)) { ++ mptcp_debug("%s: mp_dss: bad option size %d\n", ++ __func__, opsize); ++ break; ++ } ++ ++ ptr += 4; ++ ++ if (mdss->A) { ++ tcb->mptcp_flags |= MPTCPHDR_ACK; ++ ++ if (mdss->a) { ++ mopt->data_ack = (u32) get_unaligned_be64(ptr); ++ ptr += MPTCP_SUB_LEN_ACK_64; ++ } else { ++ mopt->data_ack = get_unaligned_be32(ptr); ++ ptr += MPTCP_SUB_LEN_ACK; ++ } ++ } ++ ++ tcb->dss_off = (ptr - skb_transport_header(skb)); ++ ++ if (mdss->M) { ++ if (mdss->m) { ++ u64 data_seq64 = get_unaligned_be64(ptr); ++ ++ tcb->mptcp_flags |= MPTCPHDR_SEQ64_SET; ++ mopt->data_seq = (u32) data_seq64; ++ ++ ptr += 12; /* 64-bit dseq + subseq */ ++ } else { ++ mopt->data_seq = get_unaligned_be32(ptr); ++ ptr += 8; /* 32-bit dseq + subseq */ ++ } ++ mopt->data_len = get_unaligned_be16(ptr); ++ ++ tcb->mptcp_flags |= MPTCPHDR_SEQ; ++ ++ /* Is a check-sum present? */ ++ if (opsize == mptcp_sub_len_dss(mdss, 1)) ++ tcb->mptcp_flags |= MPTCPHDR_DSS_CSUM; ++ ++ /* DATA_FIN only possible with DSS-mapping */ ++ if (mdss->F) ++ tcb->mptcp_flags |= MPTCPHDR_FIN; ++ } ++ ++ break; ++ } ++ case MPTCP_SUB_ADD_ADDR: ++ { ++ struct mp_add_addr *mpadd = (struct mp_add_addr *)ptr; ++ ++ /* If tcp_sock is not available, MPTCP version can't be ++ * retrieved and ADD_ADDR opsize validation is not possible. ++ */ ++ if (!tp || !tp->mpcb) ++ break; ++ ++ if (!is_valid_addropt_opsize(tp->mpcb->mptcp_ver, ++ mpadd, opsize)) { ++ mptcp_debug("%s: mp_add_addr: bad option size %d\n", ++ __func__, opsize); ++ break; ++ } ++ ++ /* We have to manually parse the options if we got two of them. */ ++ if (mopt->saw_add_addr) { ++ mopt->more_add_addr = 1; ++ break; ++ } ++ mopt->saw_add_addr = 1; ++ mopt->add_addr_ptr = ptr; ++ break; ++ } ++ case MPTCP_SUB_REMOVE_ADDR: ++ if ((opsize - MPTCP_SUB_LEN_REMOVE_ADDR) < 0) { ++ mptcp_debug("%s: mp_remove_addr: bad option size %d\n", ++ __func__, opsize); ++ break; ++ } ++ ++ if (mopt->saw_rem_addr) { ++ mopt->more_rem_addr = 1; ++ break; ++ } ++ mopt->saw_rem_addr = 1; ++ mopt->rem_addr_ptr = ptr; ++ break; ++ case MPTCP_SUB_PRIO: ++ { ++ const struct mp_prio *mpprio = (struct mp_prio *)ptr; ++ ++ if (opsize != MPTCP_SUB_LEN_PRIO && ++ opsize != MPTCP_SUB_LEN_PRIO_ADDR) { ++ mptcp_debug("%s: mp_prio: bad option size %d\n", ++ __func__, opsize); ++ break; ++ } ++ ++ mopt->saw_low_prio = 1; ++ mopt->low_prio = mpprio->b; ++ ++ if (opsize == MPTCP_SUB_LEN_PRIO_ADDR) { ++ mopt->saw_low_prio = 2; ++ mopt->prio_addr_id = mpprio->addr_id; ++ } ++ break; ++ } ++ case MPTCP_SUB_FAIL: ++ if (opsize != MPTCP_SUB_LEN_FAIL) { ++ mptcp_debug("%s: mp_fail: bad option size %d\n", ++ __func__, opsize); ++ break; ++ } ++ mopt->mp_fail = 1; ++ break; ++ case MPTCP_SUB_FCLOSE: ++ if (opsize != MPTCP_SUB_LEN_FCLOSE) { ++ mptcp_debug("%s: mp_fclose: bad option size %d\n", ++ __func__, opsize); ++ break; ++ } ++ ++ mopt->mp_fclose = 1; ++ mopt->mptcp_sender_key = ((struct mp_fclose *)ptr)->key; ++ ++ break; ++ default: ++ mptcp_debug("%s: Received unkown subtype: %d\n", ++ __func__, mp_opt->sub); ++ break; ++ } ++} ++ ++/** Parse only MPTCP options */ ++void tcp_parse_mptcp_options(const struct sk_buff *skb, ++ struct mptcp_options_received *mopt) ++{ ++ const struct tcphdr *th = tcp_hdr(skb); ++ int length = (th->doff * 4) - sizeof(struct tcphdr); ++ const unsigned char *ptr = (const unsigned char *)(th + 1); ++ ++ while (length > 0) { ++ int opcode = *ptr++; ++ int opsize; ++ ++ switch (opcode) { ++ case TCPOPT_EOL: ++ return; 
++ case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ ++ length--; ++ continue; ++ default: ++ opsize = *ptr++; ++ if (opsize < 2) /* "silly options" */ ++ return; ++ if (opsize > length) ++ return; /* don't parse partial options */ ++ if (opcode == TCPOPT_MPTCP) ++ mptcp_parse_options(ptr - 2, opsize, mopt, skb, NULL); ++ } ++ ptr += opsize - 2; ++ length -= opsize; ++ } ++} ++ ++bool mptcp_check_rtt(const struct tcp_sock *tp, int time) ++{ ++ struct mptcp_cb *mpcb = tp->mpcb; ++ struct mptcp_tcp_sock *mptcp; ++ u32 rtt_max = 0; ++ ++ /* In MPTCP, we take the max delay across all flows, ++ * in order to take into account meta-reordering buffers. ++ */ ++ mptcp_for_each_sub(mpcb, mptcp) { ++ struct sock *sk = mptcp_to_sock(mptcp); ++ ++ if (!mptcp_sk_can_recv(sk)) ++ continue; ++ ++ if (rtt_max < tcp_sk(sk)->rcv_rtt_est.rtt_us) ++ rtt_max = tcp_sk(sk)->rcv_rtt_est.rtt_us; ++ } ++ if (time < (rtt_max >> 3) || !rtt_max) ++ return true; ++ ++ return false; ++} ++ ++static void mptcp_handle_add_addr(const unsigned char *ptr, struct sock *sk) ++{ ++ struct mp_add_addr *mpadd = (struct mp_add_addr *)ptr; ++ struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb; ++ union inet_addr addr; ++ sa_family_t family; ++ __be16 port = 0; ++ bool is_v4; ++ ++ if (mpcb->mptcp_ver < MPTCP_VERSION_1) { ++ is_v4 = mpadd->u_bit.v0.ipver == 4; ++ } else { ++ is_v4 = mpadd->len == MPTCP_SUB_LEN_ADD_ADDR4_VER1 || ++ mpadd->len == MPTCP_SUB_LEN_ADD_ADDR4_VER1 + 2; ++ ++ /* TODO: support ADD_ADDRv1 retransmissions */ ++ if (mpadd->u_bit.v1.echo) ++ return; ++ } ++ ++ if (is_v4) { ++ u8 hash_mac_check[SHA256_DIGEST_SIZE]; ++ __be16 hmacport = 0; ++ char *recv_hmac; ++ ++ if (mpcb->mptcp_ver < MPTCP_VERSION_1) ++ goto skip_hmac_v4; ++ ++ recv_hmac = (char *)mpadd->u.v4.mac; ++ if (mpadd->len == MPTCP_SUB_LEN_ADD_ADDR4_VER1) { ++ recv_hmac -= sizeof(mpadd->u.v4.port); ++ } else if (mpadd->len == MPTCP_SUB_LEN_ADD_ADDR4_VER1 + 2) { ++ hmacport = mpadd->u.v4.port; ++ } ++ mptcp_hmac(mpcb->mptcp_ver, (u8 *)&mpcb->mptcp_rem_key, ++ (u8 *)&mpcb->mptcp_loc_key, hash_mac_check, 3, ++ 1, (u8 *)&mpadd->addr_id, ++ 4, (u8 *)&mpadd->u.v4.addr.s_addr, ++ 2, (u8 *)&hmacport); ++ if (memcmp(&hash_mac_check[SHA256_DIGEST_SIZE - sizeof(u64)], recv_hmac, 8) != 0) ++ /* ADD_ADDR2 discarded */ ++ return; ++skip_hmac_v4: ++ if ((mpcb->mptcp_ver == MPTCP_VERSION_0 && ++ mpadd->len == MPTCP_SUB_LEN_ADD_ADDR4 + 2) || ++ (mpcb->mptcp_ver == MPTCP_VERSION_1 && ++ mpadd->len == MPTCP_SUB_LEN_ADD_ADDR4_VER1 + 2)) ++ port = mpadd->u.v4.port; ++ family = AF_INET; ++ addr.in = mpadd->u.v4.addr; ++#if IS_ENABLED(CONFIG_IPV6) ++ } else { ++ u8 hash_mac_check[SHA256_DIGEST_SIZE]; ++ __be16 hmacport = 0; ++ char *recv_hmac; ++ ++ if (mpcb->mptcp_ver < MPTCP_VERSION_1) ++ goto skip_hmac_v6; ++ ++ recv_hmac = (char *)mpadd->u.v6.mac; ++ if (mpadd->len == MPTCP_SUB_LEN_ADD_ADDR6_VER1) { ++ recv_hmac -= sizeof(mpadd->u.v6.port); ++ } else if (mpadd->len == MPTCP_SUB_LEN_ADD_ADDR6_VER1 + 2) { ++ hmacport = mpadd->u.v6.port; ++ } ++ mptcp_hmac(mpcb->mptcp_ver, (u8 *)&mpcb->mptcp_rem_key, ++ (u8 *)&mpcb->mptcp_loc_key, hash_mac_check, 3, ++ 1, (u8 *)&mpadd->addr_id, ++ 16, (u8 *)&mpadd->u.v6.addr.s6_addr, ++ 2, (u8 *)&hmacport); ++ if (memcmp(&hash_mac_check[SHA256_DIGEST_SIZE - sizeof(u64)], recv_hmac, 8) != 0) ++ /* ADD_ADDR2 discarded */ ++ return; ++skip_hmac_v6: ++ if ((mpcb->mptcp_ver == MPTCP_VERSION_0 && ++ mpadd->len == MPTCP_SUB_LEN_ADD_ADDR6 + 2) || ++ (mpcb->mptcp_ver == MPTCP_VERSION_1 && ++ mpadd->len == MPTCP_SUB_LEN_ADD_ADDR6_VER1 + 2)) ++ port = 
mpadd->u.v6.port; ++ family = AF_INET6; ++ addr.in6 = mpadd->u.v6.addr; ++#endif /* CONFIG_IPV6 */ ++ } ++ ++ if (mpcb->pm_ops->add_raddr) ++ mpcb->pm_ops->add_raddr(mpcb, &addr, family, port, mpadd->addr_id); ++ ++ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ADDADDRRX); ++} ++ ++static void mptcp_handle_rem_addr(const unsigned char *ptr, struct sock *sk) ++{ ++ struct mp_remove_addr *mprem = (struct mp_remove_addr *)ptr; ++ int i; ++ u8 rem_id; ++ struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb; ++ ++ for (i = 0; i <= mprem->len - MPTCP_SUB_LEN_REMOVE_ADDR; i++) { ++ rem_id = (&mprem->addrs_id)[i]; ++ ++ if (mpcb->pm_ops->rem_raddr) ++ mpcb->pm_ops->rem_raddr(mpcb, rem_id); ++ mptcp_send_reset_rem_id(mpcb, rem_id); ++ ++ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_REMADDRSUB); ++ } ++ ++ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_REMADDRRX); ++} ++ ++static void mptcp_parse_addropt(const struct sk_buff *skb, struct sock *sk) ++{ ++ struct tcphdr *th = tcp_hdr(skb); ++ unsigned char *ptr; ++ int length = (th->doff * 4) - sizeof(struct tcphdr); ++ ++ /* Jump through the options to check whether ADD_ADDR is there */ ++ ptr = (unsigned char *)(th + 1); ++ while (length > 0) { ++ int opcode = *ptr++; ++ int opsize; ++ ++ switch (opcode) { ++ case TCPOPT_EOL: ++ return; ++ case TCPOPT_NOP: ++ length--; ++ continue; ++ default: ++ opsize = *ptr++; ++ if (opsize < 2) ++ return; ++ if (opsize > length) ++ return; /* don't parse partial options */ ++ if (opcode == TCPOPT_MPTCP && ++ ((struct mptcp_option *)ptr)->sub == MPTCP_SUB_ADD_ADDR) { ++ u8 mptcp_ver = tcp_sk(sk)->mpcb->mptcp_ver; ++ struct mp_add_addr *mpadd = (struct mp_add_addr *)ptr; ++ ++ if (!is_valid_addropt_opsize(mptcp_ver, mpadd, ++ opsize)) ++ goto cont; ++ ++ mptcp_handle_add_addr(ptr, sk); ++ } ++ if (opcode == TCPOPT_MPTCP && ++ ((struct mptcp_option *)ptr)->sub == MPTCP_SUB_REMOVE_ADDR) { ++ if ((opsize - MPTCP_SUB_LEN_REMOVE_ADDR) < 0) ++ goto cont; ++ ++ mptcp_handle_rem_addr(ptr, sk); ++ } ++cont: ++ ptr += opsize - 2; ++ length -= opsize; ++ } ++ } ++ return; ++} ++ ++static bool mptcp_mp_fastclose_rcvd(struct sock *sk) ++{ ++ struct mptcp_tcp_sock *mptcp = tcp_sk(sk)->mptcp; ++ struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb; ++ ++ if (likely(!mptcp->rx_opt.mp_fclose)) ++ return false; ++ ++ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_FASTCLOSERX); ++ mptcp->rx_opt.mp_fclose = 0; ++ if (mptcp->rx_opt.mptcp_sender_key != mpcb->mptcp_loc_key) ++ return false; ++ ++ mptcp_sub_force_close_all(mpcb, NULL); ++ ++ tcp_reset(mptcp_meta_sk(sk)); ++ ++ return true; ++} ++ ++static void mptcp_mp_fail_rcvd(struct sock *sk, const struct tcphdr *th) ++{ ++ struct mptcp_tcp_sock *mptcp = tcp_sk(sk)->mptcp; ++ struct sock *meta_sk = mptcp_meta_sk(sk); ++ struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb; ++ ++ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFAILRX); ++ mptcp->rx_opt.mp_fail = 0; ++ ++ if (!th->rst && !mpcb->infinite_mapping_snd) { ++ mpcb->send_infinite_mapping = 1; ++ ++ mptcp_restart_sending(meta_sk); ++ ++ mptcp_fallback_close(mpcb, sk); ++ } ++} ++ ++static inline void mptcp_path_array_check(struct sock *meta_sk) ++{ ++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; ++ ++ if (unlikely(mpcb->list_rcvd)) { ++ mpcb->list_rcvd = 0; ++ if (mpcb->pm_ops->new_remote_address) ++ mpcb->pm_ops->new_remote_address(meta_sk); ++ } ++} ++ ++bool mptcp_handle_options(struct sock *sk, const struct tcphdr *th, ++ const struct sk_buff *skb) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct mptcp_options_received *mopt = &tp->mptcp->rx_opt; ++ struct mptcp_cb *mpcb = tp->mpcb; 
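++
++	/* A 'true' return tells the caller to stop processing this segment
++	 * (the subflow - or, after a FASTCLOSE, the whole connection - has
++	 * been reset, or the segment must be dropped).
++	 */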
++ ++ if (tp->mpcb->infinite_mapping_rcv || tp->mpcb->infinite_mapping_snd) ++ return false; ++ ++ if (mptcp_mp_fastclose_rcvd(sk)) ++ return true; ++ ++ if (sk->sk_state == TCP_RST_WAIT && !th->rst) ++ return true; ++ ++ if (mopt->saw_mpc && !tp->mpcb->rem_key_set) ++ mptcp_initialize_recv_vars(mptcp_meta_tp(tp), tp->mpcb, ++ mopt->mptcp_sender_key); ++ ++ if (unlikely(mopt->mp_fail)) ++ mptcp_mp_fail_rcvd(sk, th); ++ ++ /* RFC 6824, Section 3.3: ++ * If a checksum is not present when its use has been negotiated, the ++ * receiver MUST close the subflow with a RST as it is considered broken. ++ */ ++ if ((mptcp_is_data_seq(skb) || mptcp_is_data_mpcapable(skb)) && ++ tp->mpcb->dss_csum && ++ !(TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_DSS_CSUM)) { ++ mptcp_send_reset(sk); ++ return true; ++ } ++ ++ /* We have to acknowledge retransmissions of the third ++ * ack. ++ */ ++ if (mopt->join_ack) { ++ tcp_send_delayed_ack(sk); ++ mopt->join_ack = 0; ++ } ++ ++ if (mopt->saw_add_addr || mopt->saw_rem_addr) { ++ if (mopt->more_add_addr || mopt->more_rem_addr) { ++ mptcp_parse_addropt(skb, sk); ++ } else { ++ if (mopt->saw_add_addr) ++ mptcp_handle_add_addr(mopt->add_addr_ptr, sk); ++ if (mopt->saw_rem_addr) ++ mptcp_handle_rem_addr(mopt->rem_addr_ptr, sk); ++ } ++ ++ mopt->more_add_addr = 0; ++ mopt->saw_add_addr = 0; ++ mopt->more_rem_addr = 0; ++ mopt->saw_rem_addr = 0; ++ } ++ if (mopt->saw_low_prio) { ++ if (mopt->saw_low_prio == 1) { ++ tp->mptcp->rcv_low_prio = mopt->low_prio; ++ if (mpcb->pm_ops->prio_changed) ++ mpcb->pm_ops->prio_changed(sk, mopt->low_prio); ++ } else { ++ struct mptcp_tcp_sock *mptcp; ++ ++ mptcp_for_each_sub(tp->mpcb, mptcp) { ++ if (mptcp->rem_id == mopt->prio_addr_id) { ++ mptcp->rcv_low_prio = mopt->low_prio; ++ if (mpcb->pm_ops->prio_changed) ++ mpcb->pm_ops->prio_changed(sk, ++ mopt->low_prio); ++ } ++ } ++ } ++ mopt->saw_low_prio = 0; ++ } ++ ++ if (mptcp_process_data_ack(sk, skb)) ++ return true; ++ ++ mptcp_path_array_check(mptcp_meta_sk(sk)); ++ /* Socket may have been mp_killed by a REMOVE_ADDR */ ++ if (tp->mp_killed) ++ return true; ++ ++ return false; ++} ++ ++static void _mptcp_rcv_synsent_fastopen(struct sock *meta_sk, ++ struct sk_buff *skb, bool rtx_queue) ++{ ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk); ++ struct tcp_sock *master_tp = tcp_sk(meta_tp->mpcb->master_sk); ++ u32 new_mapping = meta_tp->write_seq - master_tp->snd_una; ++ ++ /* If the server only acknowledges partially the data sent in ++ * the SYN, we need to trim the acknowledged part because ++ * we don't want to retransmit this already received data. ++ * When we reach this point, tcp_ack() has already cleaned up ++ * fully acked segments. However, tcp trims partially acked ++ * segments only when retransmitting. Since MPTCP comes into ++ * play only now, we will fake an initial transmit, and ++ * retransmit_skb() will not be called. The following fragment ++ * comes from __tcp_retransmit_skb(). ++ */ ++ if (before(TCP_SKB_CB(skb)->seq, master_tp->snd_una)) { ++ BUG_ON(before(TCP_SKB_CB(skb)->end_seq, master_tp->snd_una)); ++ /* tcp_trim_head can only returns ENOMEM if skb is ++ * cloned. It is not the case here (see ++ * tcp_send_syn_data). 
++ */ ++ BUG_ON(tcp_trim_head(meta_sk, skb, master_tp->snd_una - ++ TCP_SKB_CB(skb)->seq)); ++ } ++ ++ TCP_SKB_CB(skb)->seq += new_mapping; ++ TCP_SKB_CB(skb)->end_seq += new_mapping; ++ TCP_SKB_CB(skb)->sacked = 0; ++ ++ list_del(&skb->tcp_tsorted_anchor); ++ ++ if (rtx_queue) ++ tcp_rtx_queue_unlink(skb, meta_sk); ++ ++ INIT_LIST_HEAD(&skb->tcp_tsorted_anchor); ++ ++ if (rtx_queue) ++ tcp_add_write_queue_tail(meta_sk, skb); ++} ++ ++/* In case of fastopen, some data can already be in the write queue. ++ * We need to update the sequence number of the segments as they ++ * were initially TCP sequence numbers. ++ */ ++static void mptcp_rcv_synsent_fastopen(struct sock *meta_sk) ++{ ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk); ++ struct tcp_sock *master_tp = tcp_sk(meta_tp->mpcb->master_sk); ++ struct sk_buff *skb_write_head, *skb_rtx_head, *tmp; ++ ++ skb_write_head = tcp_write_queue_head(meta_sk); ++ skb_rtx_head = tcp_rtx_queue_head(meta_sk); ++ ++ if (!(skb_write_head || skb_rtx_head)) ++ return; ++ ++ /* There should only be one skb in {write, rtx} queue: the data not ++ * acknowledged in the SYN+ACK. In this case, we need to map ++ * this data to data sequence numbers. ++ */ ++ ++ BUG_ON(skb_write_head && skb_rtx_head); ++ ++ if (skb_write_head) { ++ skb_queue_walk_from_safe(&meta_sk->sk_write_queue, ++ skb_write_head, tmp) { ++ _mptcp_rcv_synsent_fastopen(meta_sk, skb_write_head, ++ false); ++ } ++ } ++ ++ if (skb_rtx_head) { ++ skb_rbtree_walk_from_safe(skb_rtx_head, tmp) { ++ _mptcp_rcv_synsent_fastopen(meta_sk, skb_rtx_head, ++ true); ++ } ++ } ++ ++ /* We can advance write_seq by the number of bytes unacknowledged ++ * and that were mapped in the previous loop. ++ */ ++ meta_tp->write_seq += master_tp->write_seq - master_tp->snd_una; ++ ++ /* The packets from the master_sk will be entailed to it later ++ * Until that time, its write queue is empty, and ++ * write_seq must align with snd_una ++ */ ++ master_tp->snd_nxt = master_tp->write_seq = master_tp->snd_una; ++ master_tp->packets_out = 0; ++ tcp_clear_retrans(meta_tp); ++ tcp_clear_retrans(master_tp); ++ tcp_set_ca_state(meta_tp->mpcb->master_sk, TCP_CA_Open); ++ tcp_set_ca_state(meta_sk, TCP_CA_Open); ++} ++ ++/* The skptr is needed, because if we become MPTCP-capable, we have to switch ++ * from meta-socket to master-socket. ++ * ++ * @return: 1 - we want to reset this connection ++ * 2 - we want to discard the received syn/ack ++ * 0 - everything is fine - continue ++ */ ++int mptcp_rcv_synsent_state_process(struct sock *sk, struct sock **skptr, ++ const struct sk_buff *skb, ++ const struct mptcp_options_received *mopt) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ ++ if (mptcp(tp)) { ++ u8 hash_mac_check[SHA256_DIGEST_SIZE]; ++ struct mptcp_cb *mpcb = tp->mpcb; ++ ++ mptcp_hmac(mpcb->mptcp_ver, (u8 *)&mpcb->mptcp_rem_key, ++ (u8 *)&mpcb->mptcp_loc_key, hash_mac_check, 2, ++ 4, (u8 *)&tp->mptcp->rx_opt.mptcp_recv_nonce, ++ 4, (u8 *)&tp->mptcp->mptcp_loc_nonce); ++ if (memcmp(hash_mac_check, ++ (char *)&tp->mptcp->rx_opt.mptcp_recv_tmac, 8)) { ++ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKMAC); ++ mptcp_sub_force_close(sk); ++ return 1; ++ } ++ ++ /* Set this flag in order to postpone data sending ++ * until the 4th ack arrives. 
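++		 * (pre_established is cleared again in mptcp_process_data_ack()
++		 * as soon as a non-SYN segment is seen on this subflow.)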
++ */ ++ tp->mptcp->pre_established = 1; ++ tp->mptcp->rcv_low_prio = tp->mptcp->rx_opt.low_prio; ++ ++ mptcp_hmac(mpcb->mptcp_ver, (u8 *)&mpcb->mptcp_loc_key, ++ (u8 *)&mpcb->mptcp_rem_key, ++ tp->mptcp->sender_mac, 2, ++ 4, (u8 *)&tp->mptcp->mptcp_loc_nonce, ++ 4, (u8 *)&tp->mptcp->rx_opt.mptcp_recv_nonce); ++ ++ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKRX); ++ } else if (mopt->saw_mpc) { ++ struct sock *meta_sk = sk; ++ ++ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVEACK); ++ if (mopt->mptcp_ver > tcp_sk(sk)->mptcp_ver) ++ /* TODO Consider adding new MPTCP_INC_STATS entry */ ++ goto fallback; ++ if (tcp_sk(sk)->mptcp_ver == MPTCP_VERSION_1 && ++ mopt->mptcp_ver < MPTCP_VERSION_1) ++ /* TODO Consider adding new MPTCP_INC_STATS entry */ ++ /* TODO - record this in the cache - use v0 next time */ ++ goto fallback; ++ ++ if (mptcp_create_master_sk(sk, mopt->mptcp_sender_key, 1, ++ mopt->mptcp_ver, ++ ntohs(tcp_hdr(skb)->window))) ++ return 2; ++ ++ sk = tcp_sk(sk)->mpcb->master_sk; ++ *skptr = sk; ++ tp = tcp_sk(sk); ++ ++ /* If fastopen was used data might be in the send queue. We ++ * need to update their sequence number to MPTCP-level seqno. ++ * Note that it can happen in rare cases that fastopen_req is ++ * NULL and syn_data is 0 but fastopen indeed occurred and ++ * data has been queued in the write queue (but not sent). ++ * Example of such rare cases: connect is non-blocking and ++ * TFO is configured to work without cookies. ++ */ ++ mptcp_rcv_synsent_fastopen(meta_sk); ++ ++ /* -1, because the SYN consumed 1 byte. In case of TFO, we ++ * start the subflow-sequence number as if the data of the SYN ++ * is not part of any mapping. ++ */ ++ tp->mptcp->snt_isn = tp->snd_una - 1; ++ tp->mpcb->dss_csum = mopt->dss_csum; ++ if (tp->mpcb->dss_csum) ++ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_CSUMENABLED); ++ ++ if (tp->mpcb->mptcp_ver >= MPTCP_VERSION_1) ++ tp->mpcb->send_mptcpv1_mpcapable = 1; ++ ++ tp->mptcp->include_mpc = 1; ++ ++ sk_set_socket(sk, meta_sk->sk_socket); ++ sk->sk_wq = meta_sk->sk_wq; ++ ++ bh_unlock_sock(sk); ++ /* hold in sk_clone_lock due to initialization to 2 */ ++ sock_put(sk); ++ } else { ++ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVEFALLBACK); ++fallback: ++ tp->request_mptcp = 0; ++ ++ if (tp->inside_tk_table) ++ mptcp_hash_remove_bh(tp); ++ } ++ ++ if (mptcp(tp)) ++ tp->mptcp->rcv_isn = TCP_SKB_CB(skb)->seq; ++ ++ return 0; ++} ++ ++/* Similar to tcp_should_expand_sndbuf */ ++bool mptcp_should_expand_sndbuf(const struct sock *sk) ++{ ++ const struct sock *meta_sk = mptcp_meta_sk(sk); ++ const struct tcp_sock *meta_tp = tcp_sk(meta_sk); ++ const struct mptcp_tcp_sock *mptcp; ++ ++ /* We circumvent this check in tcp_check_space, because we want to ++ * always call sk_write_space. So, we reproduce the check here. ++ */ ++ if (!meta_sk->sk_socket || ++ !test_bit(SOCK_NOSPACE, &meta_sk->sk_socket->flags)) ++ return false; ++ ++ /* If the user specified a specific send buffer setting, do ++ * not modify it. ++ */ ++ if (meta_sk->sk_userlocks & SOCK_SNDBUF_LOCK) ++ return false; ++ ++ /* If we are under global TCP memory pressure, do not expand. */ ++ if (tcp_under_memory_pressure(meta_sk)) ++ return false; ++ ++ /* If we are under soft global TCP memory pressure, do not expand. */ ++ if (sk_memory_allocated(meta_sk) >= sk_prot_mem_limits(meta_sk, 0)) ++ return false; ++ ++ /* For MPTCP we look for a subsocket that could send data. ++ * If we found one, then we update the send-buffer. 
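++	 * A subflow with congestion-window space left could transmit more, so
++	 * growing the meta send buffer is worthwhile.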
++ */ ++ mptcp_for_each_sub(meta_tp->mpcb, mptcp) { ++ const struct sock *sk_it = mptcp_to_sock(mptcp); ++ const struct tcp_sock *tp_it = tcp_sk(sk_it); ++ ++ if (!mptcp_sk_can_send(sk_it)) ++ continue; ++ ++ if (tcp_packets_in_flight(tp_it) < tp_it->snd_cwnd) ++ return true; ++ } ++ ++ return false; ++} ++ ++void mptcp_tcp_set_rto(struct sock *sk) ++{ ++ tcp_set_rto(sk); ++ mptcp_set_rto(sk); ++} +diff --git a/net/mptcp/mptcp_ipv4.c b/net/mptcp/mptcp_ipv4.c +new file mode 100644 +index 000000000000..0370a7680d47 +--- /dev/null ++++ b/net/mptcp/mptcp_ipv4.c +@@ -0,0 +1,431 @@ ++/* ++ * MPTCP implementation - IPv4-specific functions ++ * ++ * Initial Design & Implementation: ++ * SĂ©bastien BarrĂ© ++ * ++ * Current Maintainer: ++ * Christoph Paasch ++ * ++ * Additional authors: ++ * Jaakko Korkeaniemi ++ * Gregory Detal ++ * Fabien DuchĂªne ++ * Andreas Seelinger ++ * Lavkesh Lahngir ++ * Andreas Ripke ++ * Vlad Dogaru ++ * Octavian Purdila ++ * John Ronan ++ * Catalin Nicutar ++ * Brandon Heller ++ * ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License ++ * as published by the Free Software Foundation; either version ++ * 2 of the License, or (at your option) any later version. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++u32 mptcp_v4_get_nonce(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport) ++{ ++ return siphash_4u32((__force u32)saddr, (__force u32)daddr, ++ (__force u32)sport << 16 | (__force u32)dport, ++ mptcp_seed++, &mptcp_secret); ++} ++ ++u64 mptcp_v4_get_key(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport, ++ u32 seed) ++{ ++ return siphash_2u64((__force u64)saddr << 32 | (__force u64)daddr, ++ (__force u64)seed << 32 | (__force u64)sport << 16 | (__force u64)dport, ++ &mptcp_secret); ++} ++ ++ ++static void mptcp_v4_reqsk_destructor(struct request_sock *req) ++{ ++ mptcp_reqsk_destructor(req); ++ ++ tcp_v4_reqsk_destructor(req); ++} ++ ++static int mptcp_v4_init_req(struct request_sock *req, const struct sock *sk, ++ struct sk_buff *skb, bool want_cookie) ++{ ++ tcp_request_sock_ipv4_ops.init_req(req, sk, skb, want_cookie); ++ ++ mptcp_rsk(req)->hash_entry.pprev = NULL; ++ mptcp_rsk(req)->is_sub = 0; ++ inet_rsk(req)->mptcp_rqsk = 1; ++ ++ /* In case of SYN-cookies, we wait for the isn to be generated - it is ++ * input to the key-generation. ++ */ ++ if (!want_cookie) ++ mptcp_reqsk_init(req, sk, skb, false); ++ ++ return 0; ++} ++ ++#ifdef CONFIG_SYN_COOKIES ++static u32 mptcp_v4_cookie_init_seq(struct request_sock *req, const struct sock *sk, ++ const struct sk_buff *skb, __u16 *mssp) ++{ ++ __u32 isn = cookie_v4_init_sequence(req, sk, skb, mssp); ++ ++ tcp_rsk(req)->snt_isn = isn; ++ ++ mptcp_reqsk_init(req, sk, skb, true); ++ ++ return isn; ++} ++#endif ++ ++/* May be called without holding the meta-level lock */ ++static int mptcp_v4_join_init_req(struct request_sock *req, const struct sock *meta_sk, ++ struct sk_buff *skb, bool want_cookie) ++{ ++ struct mptcp_request_sock *mtreq = mptcp_rsk(req); ++ const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; ++ union inet_addr addr; ++ int loc_id; ++ bool low_prio = false; ++ ++ if (!mpcb->rem_key_set) ++ return -1; ++ ++ /* We need to do this as early as possible. 
Because, if we fail later ++ * (e.g., get_local_id), then reqsk_free tries to remove the ++ * request-socket from the htb in mptcp_hash_request_remove as pprev ++ * may be different from NULL. ++ */ ++ mtreq->hash_entry.pprev = NULL; ++ ++ tcp_request_sock_ipv4_ops.init_req(req, meta_sk, skb, want_cookie); ++ ++ mtreq->mptcp_loc_nonce = mptcp_v4_get_nonce(ip_hdr(skb)->saddr, ++ ip_hdr(skb)->daddr, ++ tcp_hdr(skb)->source, ++ tcp_hdr(skb)->dest); ++ addr.ip = inet_rsk(req)->ir_loc_addr; ++ loc_id = mpcb->pm_ops->get_local_id(meta_sk, AF_INET, &addr, &low_prio); ++ if (loc_id == -1) ++ return -1; ++ mtreq->loc_id = loc_id; ++ mtreq->low_prio = low_prio; ++ ++ mptcp_join_reqsk_init(mpcb, req, skb); ++ ++ return 0; ++} ++ ++/* Similar to tcp_request_sock_ops */ ++struct request_sock_ops mptcp_request_sock_ops __read_mostly = { ++ .family = PF_INET, ++ .obj_size = sizeof(struct mptcp_request_sock), ++ .rtx_syn_ack = tcp_rtx_synack, ++ .send_ack = tcp_v4_reqsk_send_ack, ++ .destructor = mptcp_v4_reqsk_destructor, ++ .send_reset = tcp_v4_send_reset, ++ .syn_ack_timeout = tcp_syn_ack_timeout, ++}; ++ ++/* Similar to: tcp_v4_conn_request ++ * May be called without holding the meta-level lock ++ */ ++static int mptcp_v4_join_request(struct sock *meta_sk, struct sk_buff *skb) ++{ ++ return tcp_conn_request(&mptcp_request_sock_ops, ++ &mptcp_join_request_sock_ipv4_ops, ++ meta_sk, skb); ++} ++ ++/* Similar to: tcp_v4_do_rcv ++ * We only process join requests here. (either the SYN or the final ACK) ++ */ ++int mptcp_v4_do_rcv(struct sock *meta_sk, struct sk_buff *skb) ++{ ++ const struct tcphdr *th = tcp_hdr(skb); ++ const struct iphdr *iph = ip_hdr(skb); ++ struct sock *child, *rsk = NULL, *sk; ++ int ret; ++ ++ sk = inet_lookup_established(sock_net(meta_sk), &tcp_hashinfo, ++ iph->saddr, th->source, iph->daddr, ++ th->dest, inet_iif(skb)); ++ ++ if (!sk) ++ goto new_subflow; ++ ++ if (is_meta_sk(sk)) { ++ WARN("%s Did not find a sub-sk - did found the meta!\n", __func__); ++ sock_put(sk); ++ goto discard; ++ } ++ ++ if (sk->sk_state == TCP_TIME_WAIT) { ++ inet_twsk_put(inet_twsk(sk)); ++ goto discard; ++ } ++ ++ if (sk->sk_state == TCP_NEW_SYN_RECV) { ++ struct request_sock *req = inet_reqsk(sk); ++ bool req_stolen; ++ ++ if (!mptcp_can_new_subflow(meta_sk)) ++ goto reset_and_discard; ++ ++ local_bh_disable(); ++ child = tcp_check_req(meta_sk, skb, req, false, &req_stolen); ++ if (!child) { ++ reqsk_put(req); ++ local_bh_enable(); ++ goto discard; ++ } ++ ++ if (child != meta_sk) { ++ ret = mptcp_finish_handshake(child, skb); ++ if (ret) { ++ rsk = child; ++ local_bh_enable(); ++ goto reset_and_discard; ++ } ++ ++ bh_unlock_sock(meta_sk); ++ local_bh_enable(); ++ return 0; ++ } ++ ++ /* tcp_check_req failed */ ++ reqsk_put(req); ++ ++ local_bh_enable(); ++ goto discard; ++ } ++ ++ ret = tcp_v4_do_rcv(sk, skb); ++ sock_put(sk); ++ ++ return ret; ++ ++new_subflow: ++ if (!mptcp_can_new_subflow(meta_sk)) ++ goto reset_and_discard; ++ ++ child = tcp_v4_cookie_check(meta_sk, skb); ++ if (!child) ++ goto discard; ++ ++ if (child != meta_sk) { ++ ret = mptcp_finish_handshake(child, skb); ++ if (ret) { ++ rsk = child; ++ goto reset_and_discard; ++ } ++ } ++ ++ if (tcp_hdr(skb)->syn) { ++ local_bh_disable(); ++ mptcp_v4_join_request(meta_sk, skb); ++ local_bh_enable(); ++ } ++ ++discard: ++ kfree_skb(skb); ++ return 0; ++ ++reset_and_discard: ++ tcp_v4_send_reset(rsk, skb); ++ goto discard; ++} ++ ++/* Create a new IPv4 subflow. ++ * ++ * We are in user-context and meta-sock-lock is hold. 
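++ * The struct socket used below lives on the stack; only the underlying
++ * sk survives and is later attached to the meta socket's wait queue.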
++ */ ++int __mptcp_init4_subsockets(struct sock *meta_sk, const struct mptcp_loc4 *loc, ++ __be16 sport, struct mptcp_rem4 *rem, ++ struct sock **subsk) ++{ ++ struct tcp_sock *tp; ++ struct sock *sk; ++ struct sockaddr_in loc_in, rem_in; ++ struct socket_alloc sock_full; ++ struct socket *sock = (struct socket *)&sock_full; ++ int ret; ++ ++ /** First, create and prepare the new socket */ ++ memcpy(&sock_full, meta_sk->sk_socket, sizeof(sock_full)); ++ sock->state = SS_UNCONNECTED; ++ sock->ops = NULL; ++ ++ ret = inet_create(sock_net(meta_sk), sock, IPPROTO_TCP, 1); ++ if (unlikely(ret < 0)) { ++ net_err_ratelimited("%s inet_create failed ret: %d\n", ++ __func__, ret); ++ return ret; ++ } ++ ++ sk = sock->sk; ++ tp = tcp_sk(sk); ++ ++ /* All subsockets need the MPTCP-lock-class */ ++ lockdep_set_class_and_name(&(sk)->sk_lock.slock, &meta_slock_key, meta_slock_key_name); ++ lockdep_init_map(&(sk)->sk_lock.dep_map, meta_key_name, &meta_key, 0); ++ ++ ret = mptcp_add_sock(meta_sk, sk, loc->loc4_id, rem->rem4_id, GFP_KERNEL); ++ if (ret) { ++ net_err_ratelimited("%s mptcp_add_sock failed ret: %d\n", ++ __func__, ret); ++ goto error; ++ } ++ ++ tp->mptcp->slave_sk = 1; ++ tp->mptcp->low_prio = loc->low_prio; ++ ++ /* Initializing the timer for an MPTCP subflow */ ++ timer_setup(&tp->mptcp->mptcp_ack_timer, mptcp_ack_handler, 0); ++ ++ /** Then, connect the socket to the peer */ ++ loc_in.sin_family = AF_INET; ++ rem_in.sin_family = AF_INET; ++ loc_in.sin_port = sport; ++ if (rem->port) ++ rem_in.sin_port = rem->port; ++ else ++ rem_in.sin_port = inet_sk(meta_sk)->inet_dport; ++ loc_in.sin_addr = loc->addr; ++ rem_in.sin_addr = rem->addr; ++ ++ if (loc->if_idx) ++ sk->sk_bound_dev_if = loc->if_idx; ++ ++ ret = kernel_bind(sock, (struct sockaddr *)&loc_in, ++ sizeof(struct sockaddr_in)); ++ if (ret < 0) { ++ net_err_ratelimited("%s: token %#x bind() to %pI4 index %d failed, error %d\n", ++ __func__, tcp_sk(meta_sk)->mpcb->mptcp_loc_token, ++ &loc_in.sin_addr, loc->if_idx, ret); ++ goto error; ++ } ++ ++ mptcp_debug("%s: token %#x pi %d src_addr:%pI4:%d dst_addr:%pI4:%d ifidx: %d\n", ++ __func__, tcp_sk(meta_sk)->mpcb->mptcp_loc_token, ++ tp->mptcp->path_index, &loc_in.sin_addr, ++ ntohs(loc_in.sin_port), &rem_in.sin_addr, ++ ntohs(rem_in.sin_port), loc->if_idx); ++ ++ if (tcp_sk(meta_sk)->mpcb->pm_ops->init_subsocket_v4) ++ tcp_sk(meta_sk)->mpcb->pm_ops->init_subsocket_v4(sk, rem->addr); ++ ++ ret = kernel_connect(sock, (struct sockaddr *)&rem_in, ++ sizeof(struct sockaddr_in), O_NONBLOCK); ++ if (ret < 0 && ret != -EINPROGRESS) { ++ net_err_ratelimited("%s: MPTCP subsocket connect() failed, error %d\n", ++ __func__, ret); ++ goto error; ++ } ++ ++ MPTCP_INC_STATS(sock_net(meta_sk), MPTCP_MIB_JOINSYNTX); ++ ++ sk_set_socket(sk, meta_sk->sk_socket); ++ sk->sk_wq = meta_sk->sk_wq; ++ ++ if (subsk) ++ *subsk = sk; ++ ++ return 0; ++ ++error: ++ /* May happen if mptcp_add_sock fails first */ ++ if (!mptcp(tp)) { ++ tcp_close(sk, 0); ++ } else { ++ local_bh_disable(); ++ mptcp_sub_force_close(sk); ++ local_bh_enable(); ++ } ++ return ret; ++} ++EXPORT_SYMBOL(__mptcp_init4_subsockets); ++ ++const struct inet_connection_sock_af_ops mptcp_v4_specific = { ++ .queue_xmit = ip_queue_xmit, ++ .send_check = tcp_v4_send_check, ++ .rebuild_header = inet_sk_rebuild_header, ++ .sk_rx_dst_set = inet_sk_rx_dst_set, ++ .conn_request = mptcp_conn_request, ++ .syn_recv_sock = tcp_v4_syn_recv_sock, ++ .net_header_len = sizeof(struct iphdr), ++ .setsockopt = ip_setsockopt, ++ .getsockopt = ip_getsockopt, ++ 
.addr2sockaddr = inet_csk_addr2sockaddr, ++ .sockaddr_len = sizeof(struct sockaddr_in), ++#ifdef CONFIG_COMPAT ++ .compat_setsockopt = compat_ip_setsockopt, ++ .compat_getsockopt = compat_ip_getsockopt, ++#endif ++ .mtu_reduced = tcp_v4_mtu_reduced, ++}; ++ ++struct tcp_request_sock_ops mptcp_request_sock_ipv4_ops; ++struct tcp_request_sock_ops mptcp_join_request_sock_ipv4_ops; ++ ++/* General initialization of IPv4 for MPTCP */ ++int mptcp_pm_v4_init(void) ++{ ++ int ret = 0; ++ struct request_sock_ops *ops = &mptcp_request_sock_ops; ++ ++ mptcp_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops; ++ mptcp_request_sock_ipv4_ops.init_req = mptcp_v4_init_req; ++#ifdef CONFIG_SYN_COOKIES ++ mptcp_request_sock_ipv4_ops.cookie_init_seq = mptcp_v4_cookie_init_seq; ++#endif ++ mptcp_join_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops; ++ mptcp_join_request_sock_ipv4_ops.init_req = mptcp_v4_join_init_req; ++ ++ ops->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", "MPTCP"); ++ if (ops->slab_name == NULL) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++ ops->slab = kmem_cache_create(ops->slab_name, ops->obj_size, 0, ++ SLAB_TYPESAFE_BY_RCU|SLAB_HWCACHE_ALIGN, ++ NULL); ++ ++ if (ops->slab == NULL) { ++ ret = -ENOMEM; ++ goto err_reqsk_create; ++ } ++ ++out: ++ return ret; ++ ++err_reqsk_create: ++ kfree(ops->slab_name); ++ ops->slab_name = NULL; ++ goto out; ++} ++ ++void mptcp_pm_v4_undo(void) ++{ ++ kmem_cache_destroy(mptcp_request_sock_ops.slab); ++ kfree(mptcp_request_sock_ops.slab_name); ++} +diff --git a/net/mptcp/mptcp_ipv6.c b/net/mptcp/mptcp_ipv6.c +new file mode 100644 +index 000000000000..8af32df4fd5f +--- /dev/null ++++ b/net/mptcp/mptcp_ipv6.c +@@ -0,0 +1,479 @@ ++/* ++ * MPTCP implementation - IPv6-specific functions ++ * ++ * Initial Design & Implementation: ++ * SĂ©bastien BarrĂ© ++ * ++ * Current Maintainer: ++ * Jaakko Korkeaniemi ++ * ++ * Additional authors: ++ * Jaakko Korkeaniemi ++ * Gregory Detal ++ * Fabien DuchĂªne ++ * Andreas Seelinger ++ * Lavkesh Lahngir ++ * Andreas Ripke ++ * Vlad Dogaru ++ * Octavian Purdila ++ * John Ronan ++ * Catalin Nicutar ++ * Brandon Heller ++ * ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License ++ * as published by the Free Software Foundation; either version ++ * 2 of the License, or (at your option) any later version. 
++ */ ++ ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++__u32 mptcp_v6_get_nonce(const __be32 *saddr, const __be32 *daddr, ++ __be16 sport, __be16 dport) ++{ ++ const struct { ++ struct in6_addr saddr; ++ struct in6_addr daddr; ++ u32 seed; ++ __be16 sport; ++ __be16 dport; ++ } __aligned(SIPHASH_ALIGNMENT) combined = { ++ .saddr = *(struct in6_addr *)saddr, ++ .daddr = *(struct in6_addr *)daddr, ++ .seed = mptcp_seed++, ++ .sport = sport, ++ .dport = dport ++ }; ++ ++ return siphash(&combined, offsetofend(typeof(combined), dport), ++ &mptcp_secret); ++} ++ ++u64 mptcp_v6_get_key(const __be32 *saddr, const __be32 *daddr, ++ __be16 sport, __be16 dport, u32 seed) ++{ ++ const struct { ++ struct in6_addr saddr; ++ struct in6_addr daddr; ++ u32 seed; ++ __be16 sport; ++ __be16 dport; ++ } __aligned(SIPHASH_ALIGNMENT) combined = { ++ .saddr = *(struct in6_addr *)saddr, ++ .daddr = *(struct in6_addr *)daddr, ++ .seed = seed, ++ .sport = sport, ++ .dport = dport ++ }; ++ ++ return siphash(&combined, offsetofend(typeof(combined), dport), ++ &mptcp_secret); ++} ++ ++static void mptcp_v6_reqsk_destructor(struct request_sock *req) ++{ ++ mptcp_reqsk_destructor(req); ++ ++ tcp_v6_reqsk_destructor(req); ++} ++ ++static int mptcp_v6_init_req(struct request_sock *req, const struct sock *sk, ++ struct sk_buff *skb, bool want_cookie) ++{ ++ tcp_request_sock_ipv6_ops.init_req(req, sk, skb, want_cookie); ++ ++ mptcp_rsk(req)->hash_entry.pprev = NULL; ++ mptcp_rsk(req)->is_sub = 0; ++ inet_rsk(req)->mptcp_rqsk = 1; ++ ++ /* In case of SYN-cookies, we wait for the isn to be generated - it is ++ * input to the key-generation. ++ */ ++ if (!want_cookie) ++ mptcp_reqsk_init(req, sk, skb, false); ++ ++ return 0; ++} ++ ++#ifdef CONFIG_SYN_COOKIES ++static u32 mptcp_v6_cookie_init_seq(struct request_sock *req, const struct sock *sk, ++ const struct sk_buff *skb, __u16 *mssp) ++{ ++ __u32 isn = cookie_v6_init_sequence(req, sk, skb, mssp); ++ ++ tcp_rsk(req)->snt_isn = isn; ++ ++ mptcp_reqsk_init(req, sk, skb, true); ++ ++ return isn; ++} ++#endif ++ ++/* May be called without holding the meta-level lock */ ++static int mptcp_v6_join_init_req(struct request_sock *req, const struct sock *meta_sk, ++ struct sk_buff *skb, bool want_cookie) ++{ ++ struct mptcp_request_sock *mtreq = mptcp_rsk(req); ++ const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; ++ union inet_addr addr; ++ int loc_id; ++ bool low_prio = false; ++ ++ if (!mpcb->rem_key_set) ++ return -1; ++ ++ /* We need to do this as early as possible. Because, if we fail later ++ * (e.g., get_local_id), then reqsk_free tries to remove the ++ * request-socket from the htb in mptcp_hash_request_remove as pprev ++ * may be different from NULL. 
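++ * Clearing pprev right away marks the request-socket as not (yet) hashed.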
++ */ ++ mtreq->hash_entry.pprev = NULL; ++ ++ tcp_request_sock_ipv6_ops.init_req(req, meta_sk, skb, want_cookie); ++ ++ mtreq->mptcp_loc_nonce = mptcp_v6_get_nonce(ipv6_hdr(skb)->saddr.s6_addr32, ++ ipv6_hdr(skb)->daddr.s6_addr32, ++ tcp_hdr(skb)->source, ++ tcp_hdr(skb)->dest); ++ addr.in6 = inet_rsk(req)->ir_v6_loc_addr; ++ loc_id = mpcb->pm_ops->get_local_id(meta_sk, AF_INET6, &addr, &low_prio); ++ if (loc_id == -1) ++ return -1; ++ mtreq->loc_id = loc_id; ++ mtreq->low_prio = low_prio; ++ ++ mptcp_join_reqsk_init(mpcb, req, skb); ++ ++ return 0; ++} ++ ++/* Similar to tcp6_request_sock_ops */ ++struct request_sock_ops mptcp6_request_sock_ops __read_mostly = { ++ .family = AF_INET6, ++ .obj_size = sizeof(struct mptcp_request_sock), ++ .rtx_syn_ack = tcp_rtx_synack, ++ .send_ack = tcp_v6_reqsk_send_ack, ++ .destructor = mptcp_v6_reqsk_destructor, ++ .send_reset = tcp_v6_send_reset, ++ .syn_ack_timeout = tcp_syn_ack_timeout, ++}; ++ ++/* Similar to: tcp_v6_conn_request ++ * May be called without holding the meta-level lock ++ */ ++static int mptcp_v6_join_request(struct sock *meta_sk, struct sk_buff *skb) ++{ ++ return tcp_conn_request(&mptcp6_request_sock_ops, ++ &mptcp_join_request_sock_ipv6_ops, ++ meta_sk, skb); ++} ++ ++int mptcp_v6_do_rcv(struct sock *meta_sk, struct sk_buff *skb) ++{ ++ const struct tcphdr *th = tcp_hdr(skb); ++ const struct ipv6hdr *ip6h = ipv6_hdr(skb); ++ struct sock *child, *rsk = NULL, *sk; ++ int ret; ++ ++ sk = __inet6_lookup_established(sock_net(meta_sk), ++ &tcp_hashinfo, ++ &ip6h->saddr, th->source, ++ &ip6h->daddr, ntohs(th->dest), ++ tcp_v6_iif(skb), tcp_v6_sdif(skb)); ++ ++ if (!sk) ++ goto new_subflow; ++ ++ if (is_meta_sk(sk)) { ++ WARN("%s Did not find a sub-sk - did found the meta!\n", __func__); ++ sock_put(sk); ++ goto discard; ++ } ++ ++ if (sk->sk_state == TCP_TIME_WAIT) { ++ inet_twsk_put(inet_twsk(sk)); ++ goto discard; ++ } ++ ++ if (sk->sk_state == TCP_NEW_SYN_RECV) { ++ struct request_sock *req = inet_reqsk(sk); ++ bool req_stolen; ++ ++ if (!mptcp_can_new_subflow(meta_sk)) ++ goto reset_and_discard; ++ ++ local_bh_disable(); ++ child = tcp_check_req(meta_sk, skb, req, false, &req_stolen); ++ if (!child) { ++ reqsk_put(req); ++ local_bh_enable(); ++ goto discard; ++ } ++ ++ if (child != meta_sk) { ++ ret = mptcp_finish_handshake(child, skb); ++ if (ret) { ++ rsk = child; ++ local_bh_enable(); ++ goto reset_and_discard; ++ } ++ ++ bh_unlock_sock(meta_sk); ++ local_bh_enable(); ++ return 0; ++ } ++ ++ /* tcp_check_req failed */ ++ reqsk_put(req); ++ ++ local_bh_enable(); ++ goto discard; ++ } ++ ++ ret = tcp_v6_do_rcv(sk, skb); ++ sock_put(sk); ++ ++ return ret; ++ ++new_subflow: ++ if (!mptcp_can_new_subflow(meta_sk)) ++ goto reset_and_discard; ++ ++ child = tcp_v6_cookie_check(meta_sk, skb); ++ if (!child) ++ goto discard; ++ ++ if (child != meta_sk) { ++ ret = mptcp_finish_handshake(child, skb); ++ if (ret) { ++ rsk = child; ++ goto reset_and_discard; ++ } ++ } ++ ++ if (tcp_hdr(skb)->syn) { ++ local_bh_disable(); ++ mptcp_v6_join_request(meta_sk, skb); ++ local_bh_enable(); ++ } ++ ++discard: ++ kfree_skb(skb); ++ return 0; ++ ++reset_and_discard: ++ tcp_v6_send_reset(rsk, skb); ++ goto discard; ++} ++ ++/* Create a new IPv6 subflow. ++ * ++ * We are in user-context and meta-sock-lock is hold. 
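++ * Mirrors __mptcp_init4_subsockets(): create the socket via inet6_create(),
++ * attach it with mptcp_add_sock(), then bind and connect it to the given
++ * IPv6 addresses.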
++ */ ++int __mptcp_init6_subsockets(struct sock *meta_sk, const struct mptcp_loc6 *loc, ++ __be16 sport, struct mptcp_rem6 *rem, ++ struct sock **subsk) ++{ ++ struct tcp_sock *tp; ++ struct sock *sk; ++ struct sockaddr_in6 loc_in, rem_in; ++ struct socket_alloc sock_full; ++ struct socket *sock = (struct socket *)&sock_full; ++ int ret; ++ ++ /** First, create and prepare the new socket */ ++ memcpy(&sock_full, meta_sk->sk_socket, sizeof(sock_full)); ++ sock->state = SS_UNCONNECTED; ++ sock->ops = NULL; ++ ++ ret = inet6_create(sock_net(meta_sk), sock, IPPROTO_TCP, 1); ++ if (unlikely(ret < 0)) { ++ net_err_ratelimited("%s inet6_create failed ret: %d\n", ++ __func__, ret); ++ return ret; ++ } ++ ++ sk = sock->sk; ++ tp = tcp_sk(sk); ++ ++ /* All subsockets need the MPTCP-lock-class */ ++ lockdep_set_class_and_name(&(sk)->sk_lock.slock, &meta_slock_key, meta_slock_key_name); ++ lockdep_init_map(&(sk)->sk_lock.dep_map, meta_key_name, &meta_key, 0); ++ ++ ret = mptcp_add_sock(meta_sk, sk, loc->loc6_id, rem->rem6_id, GFP_KERNEL); ++ if (ret) { ++ net_err_ratelimited("%s mptcp_add_sock failed ret: %d\n", ++ __func__, ret); ++ goto error; ++ } ++ ++ tp->mptcp->slave_sk = 1; ++ tp->mptcp->low_prio = loc->low_prio; ++ ++ /* Initializing the timer for an MPTCP subflow */ ++ timer_setup(&tp->mptcp->mptcp_ack_timer, mptcp_ack_handler, 0); ++ ++ /** Then, connect the socket to the peer */ ++ loc_in.sin6_family = AF_INET6; ++ rem_in.sin6_family = AF_INET6; ++ loc_in.sin6_port = sport; ++ if (rem->port) ++ rem_in.sin6_port = rem->port; ++ else ++ rem_in.sin6_port = inet_sk(meta_sk)->inet_dport; ++ loc_in.sin6_addr = loc->addr; ++ rem_in.sin6_addr = rem->addr; ++ ++ if (loc->if_idx) ++ sk->sk_bound_dev_if = loc->if_idx; ++ ++ ret = kernel_bind(sock, (struct sockaddr *)&loc_in, ++ sizeof(struct sockaddr_in6)); ++ if (ret < 0) { ++ net_err_ratelimited("%s: token %#x bind() to %pI6 index %d failed, error %d\n", ++ __func__, tcp_sk(meta_sk)->mpcb->mptcp_loc_token, ++ &loc_in.sin6_addr, loc->if_idx, ret); ++ goto error; ++ } ++ ++ mptcp_debug("%s: token %#x pi %d src_addr:%pI6:%d dst_addr:%pI6:%d ifidx: %u\n", ++ __func__, tcp_sk(meta_sk)->mpcb->mptcp_loc_token, ++ tp->mptcp->path_index, &loc_in.sin6_addr, ++ ntohs(loc_in.sin6_port), &rem_in.sin6_addr, ++ ntohs(rem_in.sin6_port), loc->if_idx); ++ ++ if (tcp_sk(meta_sk)->mpcb->pm_ops->init_subsocket_v6) ++ tcp_sk(meta_sk)->mpcb->pm_ops->init_subsocket_v6(sk, rem->addr); ++ ++ ret = kernel_connect(sock, (struct sockaddr *)&rem_in, ++ sizeof(struct sockaddr_in6), O_NONBLOCK); ++ if (ret < 0 && ret != -EINPROGRESS) { ++ net_err_ratelimited("%s: MPTCP subsocket connect() failed, error %d\n", ++ __func__, ret); ++ goto error; ++ } ++ ++ MPTCP_INC_STATS(sock_net(meta_sk), MPTCP_MIB_JOINSYNTX); ++ ++ sk_set_socket(sk, meta_sk->sk_socket); ++ sk->sk_wq = meta_sk->sk_wq; ++ ++ if (subsk) ++ *subsk = sk; ++ ++ return 0; ++ ++error: ++ /* May happen if mptcp_add_sock fails first */ ++ if (!mptcp(tp)) { ++ tcp_close(sk, 0); ++ } else { ++ local_bh_disable(); ++ mptcp_sub_force_close(sk); ++ local_bh_enable(); ++ } ++ return ret; ++} ++EXPORT_SYMBOL(__mptcp_init6_subsockets); ++ ++const struct inet_connection_sock_af_ops mptcp_v6_specific = { ++ .queue_xmit = inet6_csk_xmit, ++ .send_check = tcp_v6_send_check, ++ .rebuild_header = inet6_sk_rebuild_header, ++ .sk_rx_dst_set = inet6_sk_rx_dst_set, ++ .conn_request = mptcp_conn_request, ++ .syn_recv_sock = tcp_v6_syn_recv_sock, ++ .net_header_len = sizeof(struct ipv6hdr), ++ .net_frag_header_len = sizeof(struct frag_hdr), 
++ .setsockopt = ipv6_setsockopt, ++ .getsockopt = ipv6_getsockopt, ++ .addr2sockaddr = inet6_csk_addr2sockaddr, ++ .sockaddr_len = sizeof(struct sockaddr_in6), ++#ifdef CONFIG_COMPAT ++ .compat_setsockopt = compat_ipv6_setsockopt, ++ .compat_getsockopt = compat_ipv6_getsockopt, ++#endif ++ .mtu_reduced = tcp_v6_mtu_reduced, ++}; ++ ++const struct inet_connection_sock_af_ops mptcp_v6_mapped = { ++ .queue_xmit = ip_queue_xmit, ++ .send_check = tcp_v4_send_check, ++ .rebuild_header = inet_sk_rebuild_header, ++ .sk_rx_dst_set = inet_sk_rx_dst_set, ++ .conn_request = mptcp_conn_request, ++ .syn_recv_sock = tcp_v6_syn_recv_sock, ++ .net_header_len = sizeof(struct iphdr), ++ .setsockopt = ipv6_setsockopt, ++ .getsockopt = ipv6_getsockopt, ++ .addr2sockaddr = inet6_csk_addr2sockaddr, ++ .sockaddr_len = sizeof(struct sockaddr_in6), ++#ifdef CONFIG_COMPAT ++ .compat_setsockopt = compat_ipv6_setsockopt, ++ .compat_getsockopt = compat_ipv6_getsockopt, ++#endif ++ .mtu_reduced = tcp_v4_mtu_reduced, ++}; ++ ++struct tcp_request_sock_ops mptcp_request_sock_ipv6_ops; ++struct tcp_request_sock_ops mptcp_join_request_sock_ipv6_ops; ++ ++int mptcp_pm_v6_init(void) ++{ ++ int ret = 0; ++ struct request_sock_ops *ops = &mptcp6_request_sock_ops; ++ ++ mptcp_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops; ++ mptcp_request_sock_ipv6_ops.init_req = mptcp_v6_init_req; ++#ifdef CONFIG_SYN_COOKIES ++ mptcp_request_sock_ipv6_ops.cookie_init_seq = mptcp_v6_cookie_init_seq; ++#endif ++ ++ mptcp_join_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops; ++ mptcp_join_request_sock_ipv6_ops.init_req = mptcp_v6_join_init_req; ++ ++ ops->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", "MPTCP6"); ++ if (ops->slab_name == NULL) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++ ops->slab = kmem_cache_create(ops->slab_name, ops->obj_size, 0, ++ SLAB_TYPESAFE_BY_RCU|SLAB_HWCACHE_ALIGN, ++ NULL); ++ ++ if (ops->slab == NULL) { ++ ret = -ENOMEM; ++ goto err_reqsk_create; ++ } ++ ++out: ++ return ret; ++ ++err_reqsk_create: ++ kfree(ops->slab_name); ++ ops->slab_name = NULL; ++ goto out; ++} ++ ++void mptcp_pm_v6_undo(void) ++{ ++ kmem_cache_destroy(mptcp6_request_sock_ops.slab); ++ kfree(mptcp6_request_sock_ops.slab_name); ++} +diff --git a/net/mptcp/mptcp_ndiffports.c b/net/mptcp/mptcp_ndiffports.c +new file mode 100644 +index 000000000000..cf019990447c +--- /dev/null ++++ b/net/mptcp/mptcp_ndiffports.c +@@ -0,0 +1,174 @@ ++#include ++ ++#include ++#include ++ ++#if IS_ENABLED(CONFIG_IPV6) ++#include ++#endif ++ ++struct ndiffports_priv { ++ /* Worker struct for subflow establishment */ ++ struct work_struct subflow_work; ++ ++ struct mptcp_cb *mpcb; ++}; ++ ++static int num_subflows __read_mostly = 2; ++module_param(num_subflows, int, 0644); ++MODULE_PARM_DESC(num_subflows, "choose the number of subflows per MPTCP connection"); ++ ++/** ++ * Create all new subflows, by doing calls to mptcp_initX_subsockets ++ * ++ * This function uses a goto next_subflow, to allow releasing the lock between ++ * new subflows and giving other processes a chance to do some work on the ++ * socket and potentially finishing the communication. 
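++ *
++ * The loop stops once num_subflows subflows exist, or earlier if the
++ * meta-socket is dead, the connection fell back to regular TCP or the
++ * master subflow is not yet fully established (see the checks below).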
++ **/ ++static void create_subflow_worker(struct work_struct *work) ++{ ++ const struct ndiffports_priv *pm_priv = container_of(work, ++ struct ndiffports_priv, ++ subflow_work); ++ struct mptcp_cb *mpcb = pm_priv->mpcb; ++ struct sock *meta_sk = mpcb->meta_sk; ++ int iter = 0; ++ ++next_subflow: ++ if (iter) { ++ release_sock(meta_sk); ++ mutex_unlock(&mpcb->mpcb_mutex); ++ ++ cond_resched(); ++ } ++ mutex_lock(&mpcb->mpcb_mutex); ++ lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING); ++ ++ if (!mptcp(tcp_sk(meta_sk))) ++ goto exit; ++ ++ iter++; ++ ++ if (sock_flag(meta_sk, SOCK_DEAD)) ++ goto exit; ++ ++ if (mpcb->master_sk && ++ !tcp_sk(mpcb->master_sk)->mptcp->fully_established) ++ goto exit; ++ ++ if (num_subflows > iter && num_subflows > mptcp_subflow_count(mpcb)) { ++ if (meta_sk->sk_family == AF_INET || ++ mptcp_v6_is_v4_mapped(meta_sk)) { ++ struct mptcp_loc4 loc; ++ struct mptcp_rem4 rem; ++ ++ loc.addr.s_addr = inet_sk(meta_sk)->inet_saddr; ++ loc.loc4_id = 0; ++ loc.low_prio = 0; ++ if (mpcb->master_sk) ++ loc.if_idx = mpcb->master_sk->sk_bound_dev_if; ++ else ++ loc.if_idx = 0; ++ ++ rem.addr.s_addr = inet_sk(meta_sk)->inet_daddr; ++ rem.port = inet_sk(meta_sk)->inet_dport; ++ rem.rem4_id = 0; /* Default 0 */ ++ ++ mptcp_init4_subsockets(meta_sk, &loc, &rem); ++ } else { ++#if IS_ENABLED(CONFIG_IPV6) ++ struct mptcp_loc6 loc; ++ struct mptcp_rem6 rem; ++ ++ loc.addr = inet6_sk(meta_sk)->saddr; ++ loc.loc6_id = 0; ++ loc.low_prio = 0; ++ if (mpcb->master_sk) ++ loc.if_idx = mpcb->master_sk->sk_bound_dev_if; ++ else ++ loc.if_idx = 0; ++ ++ rem.addr = meta_sk->sk_v6_daddr; ++ rem.port = inet_sk(meta_sk)->inet_dport; ++ rem.rem6_id = 0; /* Default 0 */ ++ ++ mptcp_init6_subsockets(meta_sk, &loc, &rem); ++#endif ++ } ++ goto next_subflow; ++ } ++ ++exit: ++ release_sock(meta_sk); ++ mutex_unlock(&mpcb->mpcb_mutex); ++ mptcp_mpcb_put(mpcb); ++ sock_put(meta_sk); ++} ++ ++static void ndiffports_new_session(const struct sock *meta_sk) ++{ ++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; ++ struct ndiffports_priv *fmp = (struct ndiffports_priv *)&mpcb->mptcp_pm[0]; ++ ++ /* Initialize workqueue-struct */ ++ INIT_WORK(&fmp->subflow_work, create_subflow_worker); ++ fmp->mpcb = mpcb; ++} ++ ++static void ndiffports_create_subflows(struct sock *meta_sk) ++{ ++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; ++ struct ndiffports_priv *pm_priv = (struct ndiffports_priv *)&mpcb->mptcp_pm[0]; ++ ++ if (mptcp_in_infinite_mapping_weak(mpcb) || ++ mpcb->server_side || sock_flag(meta_sk, SOCK_DEAD)) ++ return; ++ ++ if (!work_pending(&pm_priv->subflow_work)) { ++ sock_hold(meta_sk); ++ refcount_inc(&mpcb->mpcb_refcnt); ++ queue_work(mptcp_wq, &pm_priv->subflow_work); ++ } ++} ++ ++static int ndiffports_get_local_id(const struct sock *meta_sk, ++ sa_family_t family, union inet_addr *addr, ++ bool *low_prio) ++{ ++ return 0; ++} ++ ++static struct mptcp_pm_ops ndiffports __read_mostly = { ++ .new_session = ndiffports_new_session, ++ .fully_established = ndiffports_create_subflows, ++ .get_local_id = ndiffports_get_local_id, ++ .name = "ndiffports", ++ .owner = THIS_MODULE, ++}; ++ ++/* General initialization of MPTCP_PM */ ++static int __init ndiffports_register(void) ++{ ++ BUILD_BUG_ON(sizeof(struct ndiffports_priv) > MPTCP_PM_SIZE); ++ ++ if (mptcp_register_path_manager(&ndiffports)) ++ goto exit; ++ ++ return 0; ++ ++exit: ++ return -1; ++} ++ ++static void ndiffports_unregister(void) ++{ ++ mptcp_unregister_path_manager(&ndiffports); ++} ++ ++module_init(ndiffports_register); 
++module_exit(ndiffports_unregister); ++ ++MODULE_AUTHOR("Christoph Paasch"); ++MODULE_LICENSE("GPL"); ++MODULE_DESCRIPTION("NDIFF-PORTS MPTCP"); ++MODULE_VERSION("0.88"); +diff --git a/net/mptcp/mptcp_netlink.c b/net/mptcp/mptcp_netlink.c +new file mode 100644 +index 000000000000..dd696841ea85 +--- /dev/null ++++ b/net/mptcp/mptcp_netlink.c +@@ -0,0 +1,1272 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* MPTCP implementation - Netlink Path Manager ++ * ++ * Analysis, Design and Implementation: ++ * - Gregory Detal ++ * - SĂ©bastien BarrĂ© ++ * - Matthieu Baerts ++ * - Pau Espin Pedrol ++ * - Detlev Casanova ++ * - David Verbeiren ++ * - Frank Vanbever ++ * - Antoine Maes ++ * - Tim Froidcoeur ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License ++ * as published by the Free Software Foundation; either version ++ * 2 of the License, or (at your option) any later version. ++ */ ++ ++#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt ++#include ++#include ++#include ++#include ++#include ++#if IS_ENABLED(CONFIG_IPV6) ++#include ++#endif ++ ++#define MPTCP_MAX_ADDR 8 ++ ++struct mptcp_nl_priv { ++ /* Unfortunately we need to store this to generate MP_JOINs in case ++ * of the peer generating a subflow (see get_local_id). ++ */ ++ u8 loc4_bits; ++ u8 announced4; ++ struct mptcp_loc4 locaddr4[MPTCP_MAX_ADDR]; ++ ++#if IS_ENABLED(CONFIG_IPV6) ++ u8 loc6_bits; ++ u8 announced6; ++ struct mptcp_loc6 locaddr6[MPTCP_MAX_ADDR]; ++#endif ++ ++ u16 remove_addrs; ++ ++ bool is_closed; ++}; ++ ++static struct genl_family mptcp_genl_family; ++ ++#define MPTCP_GENL_EV_GRP_OFFSET 0 ++#define MPTCP_GENL_CMD_GRP_OFFSET 1 ++ ++static const struct genl_multicast_group mptcp_mcgrps[] = { ++ [MPTCP_GENL_EV_GRP_OFFSET] = { .name = MPTCP_GENL_EV_GRP_NAME, }, ++ [MPTCP_GENL_CMD_GRP_OFFSET] = { .name = MPTCP_GENL_CMD_GRP_NAME, }, ++}; ++ ++static const struct nla_policy mptcp_nl_genl_policy[MPTCP_ATTR_MAX + 1] = { ++ [MPTCP_ATTR_TOKEN] = { .type = NLA_U32, }, ++ [MPTCP_ATTR_FAMILY] = { .type = NLA_U16, }, ++ [MPTCP_ATTR_LOC_ID] = { .type = NLA_U8, }, ++ [MPTCP_ATTR_REM_ID] = { .type = NLA_U8, }, ++ [MPTCP_ATTR_SADDR4] = { .type = NLA_U32, }, ++ [MPTCP_ATTR_SADDR6] = { .type = NLA_BINARY, ++ .len = sizeof(struct in6_addr), }, ++ [MPTCP_ATTR_DADDR4] = { .type = NLA_U32, }, ++ [MPTCP_ATTR_DADDR6] = { .type = NLA_BINARY, ++ .len = sizeof(struct in6_addr), }, ++ [MPTCP_ATTR_SPORT] = { .type = NLA_U16, }, ++ [MPTCP_ATTR_DPORT] = { .type = NLA_U16, }, ++ [MPTCP_ATTR_BACKUP] = { .type = NLA_U8, }, ++ [MPTCP_ATTR_FLAGS] = { .type = NLA_U16, }, ++ [MPTCP_ATTR_TIMEOUT] = { .type = NLA_U32, }, ++ [MPTCP_ATTR_IF_IDX] = { .type = NLA_S32, }, ++}; ++ ++/* Defines the userspace PM filter on events. Set events are ignored. */ ++static u16 mptcp_nl_event_filter; ++ ++static inline struct mptcp_nl_priv * ++mptcp_nl_priv(const struct sock *meta_sk) ++{ ++ return (struct mptcp_nl_priv *)&tcp_sk(meta_sk)->mpcb->mptcp_pm[0]; ++} ++ ++static inline bool ++mptcp_nl_must_notify(u16 event, const struct sock *meta_sk) ++{ ++ struct mptcp_nl_priv *priv = mptcp_nl_priv(meta_sk); ++ ++ /* close_session() can be called before other events because it is ++ * also called when doing a fallback to TCP. We don't want to send ++ * events to the user-space after having sent the CLOSED event. 
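++ * The is_closed flag below enforces this: once an MPTCPF_EVENT_CLOSED has
++ * been seen, every later event is dropped.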
++ */ ++ if (priv->is_closed) ++ return false; ++ ++ if (event == MPTCPF_EVENT_CLOSED) ++ priv->is_closed = true; ++ ++ if (mptcp_nl_event_filter & event) ++ return false; ++ ++ if (!genl_has_listeners(&mptcp_genl_family, sock_net(meta_sk), 0)) ++ return false; ++ ++ return true; ++} ++ ++/* Find the first free index in the bitfield starting from 0 */ ++static int ++mptcp_nl_find_free_index(u8 bitfield) ++{ ++ int i; ++ ++ /* There are anyways no free bits... */ ++ if (bitfield == 0xff) ++ return -1; ++ ++ i = ffs(~bitfield) - 1; ++ if (i < 0) ++ return -1; ++ ++ return i; ++} ++ ++static inline int ++mptcp_nl_put_subsk(struct sk_buff *msg, struct sock *sk) ++{ ++ struct inet_sock *isk = inet_sk(sk); ++ struct sock *meta_sk = mptcp_meta_sk(sk); ++ u8 backup; ++ u8 sk_err; ++ ++ if (nla_put_u16(msg, MPTCP_ATTR_FAMILY, sk->sk_family)) ++ goto nla_put_failure; ++ ++ if (nla_put_u8(msg, MPTCP_ATTR_LOC_ID, tcp_sk(sk)->mptcp->loc_id)) ++ goto nla_put_failure; ++ ++ if (nla_put_u8(msg, MPTCP_ATTR_REM_ID, tcp_sk(sk)->mptcp->rem_id)) ++ goto nla_put_failure; ++ ++ switch (sk->sk_family) { ++ case AF_INET: ++ if (nla_put_u32(msg, MPTCP_ATTR_SADDR4, isk->inet_saddr)) ++ goto nla_put_failure; ++ ++ if (nla_put_u32(msg, MPTCP_ATTR_DADDR4, isk->inet_daddr)) ++ goto nla_put_failure; ++ break; ++#if IS_ENABLED(CONFIG_IPV6) ++ case AF_INET6: { ++ struct ipv6_pinfo *np = inet6_sk(sk); ++ ++ if (nla_put(msg, MPTCP_ATTR_SADDR6, sizeof(np->saddr), ++ &np->saddr)) ++ goto nla_put_failure; ++ ++ if (nla_put(msg, MPTCP_ATTR_DADDR6, sizeof(sk->sk_v6_daddr), ++ &sk->sk_v6_daddr)) ++ goto nla_put_failure; ++ break; ++ } ++#endif ++ default: ++ goto nla_put_failure; ++ } ++ ++ if (nla_put_u16(msg, MPTCP_ATTR_SPORT, ntohs(isk->inet_sport))) ++ goto nla_put_failure; ++ ++ if (nla_put_u16(msg, MPTCP_ATTR_DPORT, ntohs(isk->inet_dport))) ++ goto nla_put_failure; ++ ++ backup = !!(tcp_sk(sk)->mptcp->rcv_low_prio || ++ tcp_sk(sk)->mptcp->low_prio); ++ ++ if (nla_put_u8(msg, MPTCP_ATTR_BACKUP, backup)) ++ goto nla_put_failure; ++ ++ if (nla_put_s32(msg, MPTCP_ATTR_IF_IDX, sk->sk_bound_dev_if)) ++ goto nla_put_failure; ++ ++ sk_err = sk->sk_err ? 
: tcp_sk(sk)->mptcp->sk_err; ++ if (unlikely(sk_err != 0) && meta_sk->sk_state == TCP_ESTABLISHED && ++ nla_put_u8(msg, MPTCP_ATTR_ERROR, sk_err)) ++ goto nla_put_failure; ++ ++ return 0; ++ ++nla_put_failure: ++ return -1; ++} ++ ++static inline struct sk_buff * ++mptcp_nl_mcast_prepare(struct mptcp_cb *mpcb, struct sock *sk, int cmd, ++ void **hdr) ++{ ++ struct sk_buff *msg; ++ ++ /* possible optimisation: use the needed size */ ++ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); ++ if (!msg) ++ return NULL; ++ ++ *hdr = genlmsg_put(msg, 0, 0, &mptcp_genl_family, 0, cmd); ++ if (!*hdr) ++ goto free_msg; ++ ++ if (nla_put_u32(msg, MPTCP_ATTR_TOKEN, mpcb->mptcp_loc_token)) ++ goto nla_put_failure; ++ ++ if (sk && mptcp_nl_put_subsk(msg, sk)) ++ goto nla_put_failure; ++ ++ return msg; ++ ++nla_put_failure: ++ genlmsg_cancel(msg, *hdr); ++free_msg: ++ nlmsg_free(msg); ++ return NULL; ++} ++ ++static inline int ++mptcp_nl_mcast_send(struct mptcp_cb *mpcb, struct sk_buff *msg, void *hdr) ++{ ++ int ret; ++ struct sock *meta_sk = mpcb->meta_sk; ++ ++ genlmsg_end(msg, hdr); ++ ++ ret = genlmsg_multicast_netns(&mptcp_genl_family, sock_net(meta_sk), ++ msg, 0, MPTCP_GENL_EV_GRP_OFFSET, ++ GFP_ATOMIC); ++ if (ret && ret != -ESRCH) ++ pr_err("%s: genlmsg_multicast failed with %d\n", __func__, ret); ++ return ret; ++} ++ ++static inline void ++mptcp_nl_mcast(struct mptcp_cb *mpcb, struct sock *sk, int cmd) ++{ ++ void *hdr; ++ struct sk_buff *msg; ++ ++ msg = mptcp_nl_mcast_prepare(mpcb, sk, cmd, &hdr); ++ if (msg) ++ mptcp_nl_mcast_send(mpcb, msg, hdr); ++ else ++ pr_warn("%s: unable to prepare multicast message\n", __func__); ++} ++ ++static inline void ++mptcp_nl_mcast_fail(struct sk_buff *msg, void *hdr) ++{ ++ genlmsg_cancel(msg, hdr); ++ nlmsg_free(msg); ++} ++ ++static void ++mptcp_nl_new(const struct sock *meta_sk, bool established) ++{ ++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; ++ ++ mptcp_nl_mcast(mpcb, mpcb->master_sk, ++ established ? MPTCP_EVENT_ESTABLISHED ++ : MPTCP_EVENT_CREATED); ++} ++ ++static void ++mptcp_nl_pm_new_session(const struct sock *meta_sk) ++{ ++ if (!mptcp_nl_must_notify(MPTCPF_EVENT_CREATED, meta_sk)) ++ return; ++ ++ mptcp_nl_new(meta_sk, false); ++} ++ ++static inline int ++mptcp_nl_loc_id_to_index_lookup(struct sock *meta_sk, sa_family_t family, ++ u8 addr_id) ++{ ++ struct mptcp_nl_priv *priv = mptcp_nl_priv(meta_sk); ++ int i; ++ ++ switch (family) { ++ case AF_INET: ++ mptcp_for_each_bit_set(priv->loc4_bits, i) { ++ if (priv->locaddr4[i].loc4_id == addr_id) ++ return i; ++ } ++ break; ++#if IS_ENABLED(CONFIG_IPV6) ++ case AF_INET6: ++ mptcp_for_each_bit_set(priv->loc6_bits, i) { ++ if (priv->locaddr6[i].loc6_id == addr_id) ++ return i; ++ } ++ break; ++#endif ++ } ++ return -1; ++} ++ ++static inline void ++mptcp_nl_sk_setup_locaddr(struct sock *meta_sk, struct sock *sk) ++{ ++ struct mptcp_nl_priv *priv = mptcp_nl_priv(meta_sk); ++ bool backup = !!(tcp_sk(sk)->mptcp->rcv_low_prio || ++ tcp_sk(sk)->mptcp->low_prio); ++ sa_family_t family = mptcp_v6_is_v4_mapped(sk) ? AF_INET ++ : sk->sk_family; ++ u8 addr_id = tcp_sk(sk)->mptcp->loc_id; ++ int idx = mptcp_nl_loc_id_to_index_lookup(meta_sk, family, ++ addr_id); ++ ++ /* Same as in mptcp_fullmesh.c: exception for transparent sockets */ ++ int if_idx = inet_sk(sk)->transparent ? 
inet_sk(sk)->rx_dst_ifindex : ++ sk->sk_bound_dev_if; ++ ++ switch (family) { ++ case AF_INET: { ++ struct inet_sock *isk = inet_sk(sk); ++ ++ if (idx == -1) ++ idx = mptcp_nl_find_free_index(priv->loc4_bits); ++ if (idx == -1) { ++ pr_warn("No free index for sk loc_id v4\n"); ++ return; ++ } ++ priv->locaddr4[idx].addr.s_addr = isk->inet_saddr; ++ priv->locaddr4[idx].loc4_id = addr_id; ++ priv->locaddr4[idx].low_prio = backup; ++ priv->locaddr4[idx].if_idx = if_idx; ++ priv->loc4_bits |= 1 << idx; ++ priv->announced4 |= 1 << idx; ++ break; ++ } ++#if IS_ENABLED(CONFIG_IPV6) ++ case AF_INET6: { ++ struct ipv6_pinfo *np = inet6_sk(sk); ++ ++ if (idx == -1) ++ idx = mptcp_nl_find_free_index(priv->loc6_bits); ++ if (idx == -1) { ++ pr_warn("No free index for sk loc_id v6\n"); ++ return; ++ } ++ priv->locaddr6[idx].addr = np->saddr; ++ priv->locaddr6[idx].loc6_id = addr_id; ++ priv->locaddr6[idx].low_prio = backup; ++ priv->locaddr6[idx].if_idx = if_idx; ++ priv->loc6_bits |= 1 << idx; ++ priv->announced6 |= 1 << idx; ++ break; ++ } ++#endif ++ } ++} ++ ++static void ++mptcp_nl_pm_fully_established(struct sock *meta_sk) ++{ ++ mptcp_nl_sk_setup_locaddr(meta_sk, tcp_sk(meta_sk)->mpcb->master_sk); ++ ++ if (!mptcp_nl_must_notify(MPTCPF_EVENT_ESTABLISHED, meta_sk)) ++ return; ++ ++ mptcp_nl_new(meta_sk, true); ++} ++ ++static void ++mptcp_nl_pm_close_session(struct sock *meta_sk) ++{ ++ if (!mptcp_nl_must_notify(MPTCPF_EVENT_CLOSED, meta_sk)) ++ return; ++ ++ mptcp_nl_mcast(tcp_sk(meta_sk)->mpcb, NULL, MPTCP_EVENT_CLOSED); ++} ++ ++static void ++mptcp_nl_pm_established_subflow(struct sock *sk) ++{ ++ struct sock *meta_sk = mptcp_meta_sk(sk); ++ ++ mptcp_nl_sk_setup_locaddr(meta_sk, sk); ++ ++ if (!mptcp_nl_must_notify(MPTCPF_EVENT_SUB_ESTABLISHED, meta_sk)) ++ return; ++ ++ mptcp_nl_mcast(tcp_sk(meta_sk)->mpcb, sk, MPTCP_EVENT_SUB_ESTABLISHED); ++} ++ ++static void ++mptcp_nl_pm_delete_subflow(struct sock *sk) ++{ ++ struct sock *meta_sk = mptcp_meta_sk(sk); ++ ++ if (!mptcp_nl_must_notify(MPTCPF_EVENT_SUB_CLOSED, meta_sk)) ++ return; ++ ++ mptcp_nl_mcast(tcp_sk(meta_sk)->mpcb, sk, MPTCP_EVENT_SUB_CLOSED); ++} ++ ++static void ++mptcp_nl_pm_add_raddr(struct mptcp_cb *mpcb, const union inet_addr *addr, ++ sa_family_t family, __be16 port, u8 id) ++{ ++ struct sk_buff *msg; ++ void *hdr; ++ ++ if (!mptcp_nl_must_notify(MPTCPF_EVENT_ANNOUNCED, mpcb->meta_sk)) ++ return; ++ ++ msg = mptcp_nl_mcast_prepare(mpcb, NULL, MPTCP_EVENT_ANNOUNCED, &hdr); ++ if (!msg) ++ return; ++ ++ if (nla_put_u8(msg, MPTCP_ATTR_REM_ID, id)) ++ goto nla_put_failure; ++ ++ if (nla_put_u16(msg, MPTCP_ATTR_FAMILY, family)) ++ goto nla_put_failure; ++ ++ switch (family) { ++ case AF_INET: ++ if (nla_put_u32(msg, MPTCP_ATTR_DADDR4, addr->ip)) ++ goto nla_put_failure; ++ break; ++#if IS_ENABLED(CONFIG_IPV6) ++ case AF_INET6: ++ if (nla_put(msg, MPTCP_ATTR_DADDR6, sizeof(addr->ip6), ++ &addr->ip6)) ++ goto nla_put_failure; ++ break; ++#endif ++ default: ++ goto nla_put_failure; ++ } ++ ++ if (nla_put_u16(msg, MPTCP_ATTR_DPORT, ntohs(port))) ++ goto nla_put_failure; ++ ++ mptcp_nl_mcast_send(mpcb, msg, hdr); ++ ++ return; ++ ++nla_put_failure: ++ mptcp_nl_mcast_fail(msg, hdr); ++} ++ ++static void ++mptcp_nl_pm_rem_raddr(struct mptcp_cb *mpcb, u8 id) ++{ ++ struct sk_buff *msg; ++ void *hdr; ++ ++ if (!mptcp_nl_must_notify(MPTCPF_EVENT_REMOVED, mpcb->meta_sk)) ++ return; ++ ++ msg = mptcp_nl_mcast_prepare(mpcb, NULL, MPTCP_EVENT_REMOVED, &hdr); ++ ++ if (!msg) ++ return; ++ ++ if (nla_put_u8(msg, MPTCP_ATTR_REM_ID, id)) ++ goto 
nla_put_failure; ++ ++ mptcp_nl_mcast_send(mpcb, msg, hdr); ++ ++ return; ++ ++nla_put_failure: ++ mptcp_nl_mcast_fail(msg, hdr); ++} ++ ++static int ++mptcp_nl_pm_get_local_id(const struct sock *meta_sk, sa_family_t family, ++ union inet_addr *addr, bool *low_prio) ++{ ++ struct mptcp_nl_priv *priv = mptcp_nl_priv(meta_sk); ++ int i, id = 0; ++ ++ switch (family) { ++ case AF_INET: ++ mptcp_for_each_bit_set(priv->loc4_bits, i) { ++ if (addr->in.s_addr == priv->locaddr4[i].addr.s_addr) { ++ id = priv->locaddr4[i].loc4_id; ++ *low_prio = priv->locaddr4[i].low_prio; ++ goto out; ++ } ++ } ++ break; ++#if IS_ENABLED(CONFIG_IPV6) ++ case AF_INET6: ++ mptcp_for_each_bit_set(priv->loc6_bits, i) { ++ if (ipv6_addr_equal(&addr->in6, ++ &priv->locaddr6[i].addr)) { ++ id = priv->locaddr6[i].loc6_id; ++ *low_prio = priv->locaddr6[i].low_prio; ++ goto out; ++ } ++ } ++ break; ++#endif ++ } ++ return -1; ++ ++out: ++ return id; ++} ++ ++static void ++mptcp_nl_pm_addr_signal(struct sock *sk, unsigned *size, ++ struct tcp_out_options *opts, struct sk_buff *skb) ++{ ++ struct mptcp_nl_priv *priv = mptcp_nl_priv(sk); ++ struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb; ++ u8 unannounced; ++ int remove_addr_len; ++ ++ unannounced = (~priv->announced4) & priv->loc4_bits; ++ if (unannounced && ++ MAX_TCP_OPTION_SPACE - *size >= MPTCP_SUB_LEN_ADD_ADDR4_ALIGN) { ++ int i = mptcp_nl_find_free_index(~unannounced); ++ ++ opts->options |= OPTION_MPTCP; ++ opts->mptcp_options |= OPTION_ADD_ADDR; ++ opts->add_addr4.addr_id = priv->locaddr4[i].loc4_id; ++ opts->add_addr4.addr = priv->locaddr4[i].addr; ++ opts->add_addr_v4 = 1; ++ ++ if (skb) ++ priv->announced4 |= (1 << i); ++ *size += MPTCP_SUB_LEN_ADD_ADDR4_ALIGN; ++ } ++ ++#if IS_ENABLED(CONFIG_IPV6) ++ unannounced = (~priv->announced6) & priv->loc6_bits; ++ if (unannounced && ++ MAX_TCP_OPTION_SPACE - *size >= MPTCP_SUB_LEN_ADD_ADDR6_ALIGN) { ++ int i = mptcp_nl_find_free_index(~unannounced); ++ ++ opts->options |= OPTION_MPTCP; ++ opts->mptcp_options |= OPTION_ADD_ADDR; ++ opts->add_addr6.addr_id = priv->locaddr6[i].loc6_id; ++ opts->add_addr6.addr = priv->locaddr6[i].addr; ++ opts->add_addr_v6 = 1; ++ ++ if (skb) ++ priv->announced6 |= (1 << i); ++ *size += MPTCP_SUB_LEN_ADD_ADDR6_ALIGN; ++ } ++#endif ++ ++ if (likely(!priv->remove_addrs)) ++ goto exit; ++ ++ remove_addr_len = mptcp_sub_len_remove_addr_align(priv->remove_addrs); ++ if (MAX_TCP_OPTION_SPACE - *size < remove_addr_len) ++ goto exit; ++ ++ opts->options |= OPTION_MPTCP; ++ opts->mptcp_options |= OPTION_REMOVE_ADDR; ++ opts->remove_addrs = priv->remove_addrs; ++ ++ if (skb) ++ priv->remove_addrs = 0; ++ *size += remove_addr_len; ++ ++exit: ++ mpcb->addr_signal = !!((~priv->announced4) & priv->loc4_bits || ++#if IS_ENABLED(CONFIG_IPV6) ++ (~priv->announced6) & priv->loc6_bits || ++#endif ++ priv->remove_addrs); ++} ++ ++static void ++mptcp_nl_pm_prio_changed(struct sock *sk, int low_prio) ++{ ++ struct sock *meta_sk = mptcp_meta_sk(sk); ++ ++ if (!mptcp_nl_must_notify(MPTCPF_EVENT_SUB_PRIORITY, meta_sk)) ++ return; ++ ++ mptcp_nl_mcast(tcp_sk(meta_sk)->mpcb, sk, MPTCP_EVENT_SUB_PRIORITY); ++} ++ ++static int ++mptcp_nl_genl_announce(struct sk_buff *skb, struct genl_info *info) ++{ ++ struct sock *meta_sk, *subsk; ++ struct mptcp_cb *mpcb; ++ struct mptcp_nl_priv *priv; ++ u32 token; ++ u8 addr_id, backup = 0; ++ u16 family; ++ int i, ret = 0; ++ union inet_addr saddr; ++ int if_idx = 0; ++ bool useless; /* unused out parameter "low_prio" */ ++ ++ if (!info->attrs[MPTCP_ATTR_TOKEN] || 
!info->attrs[MPTCP_ATTR_FAMILY] || ++ !info->attrs[MPTCP_ATTR_LOC_ID]) ++ return -EINVAL; ++ ++ token = nla_get_u32(info->attrs[MPTCP_ATTR_TOKEN]); ++ meta_sk = mptcp_hash_find(genl_info_net(info), token); ++ if (!meta_sk) ++ return -EINVAL; ++ ++ mpcb = tcp_sk(meta_sk)->mpcb; ++ priv = mptcp_nl_priv(meta_sk); ++ family = nla_get_u16(info->attrs[MPTCP_ATTR_FAMILY]); ++ addr_id = nla_get_u8(info->attrs[MPTCP_ATTR_LOC_ID]); ++ ++ if (info->attrs[MPTCP_ATTR_BACKUP]) ++ backup = nla_get_u8(info->attrs[MPTCP_ATTR_BACKUP]); ++ ++ if (info->attrs[MPTCP_ATTR_IF_IDX]) ++ if_idx = nla_get_s32(info->attrs[MPTCP_ATTR_IF_IDX]); ++ ++ mutex_lock(&mpcb->mpcb_mutex); ++ lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING); ++ ++ switch (family) { ++ case AF_INET: ++ if (!info->attrs[MPTCP_ATTR_SADDR4]) { ++ ret = -EINVAL; ++ goto exit; ++ } ++ ++ saddr.in.s_addr = nla_get_u32(info->attrs[MPTCP_ATTR_SADDR4]); ++ i = mptcp_nl_pm_get_local_id(meta_sk, family, ++ &saddr, &useless); ++ if (i < 0) { ++ i = mptcp_nl_find_free_index(priv->loc4_bits); ++ if (i < 0) { ++ ret = -ENOBUFS; ++ goto exit; ++ } ++ } else if (i != addr_id) { ++ ret = -EINVAL; ++ goto exit; ++ } ++ ++ priv->locaddr4[i].addr.s_addr = saddr.in.s_addr; ++ priv->locaddr4[i].loc4_id = addr_id; ++ priv->locaddr4[i].low_prio = !!backup; ++ priv->locaddr4[i].if_idx = if_idx; ++ priv->loc4_bits |= 1 << i; ++ priv->announced4 &= ~(1 << i); ++ break; ++#if IS_ENABLED(CONFIG_IPV6) ++ case AF_INET6: ++ if (!info->attrs[MPTCP_ATTR_SADDR6]) { ++ ret = -EINVAL; ++ goto exit; ++ } ++ ++ saddr.in6 = *(struct in6_addr *) ++ nla_data(info->attrs[MPTCP_ATTR_SADDR6]); ++ i = mptcp_nl_pm_get_local_id(meta_sk, family, &saddr, &useless); ++ if (i < 0) { ++ i = mptcp_nl_find_free_index(priv->loc6_bits); ++ if (i < 0) { ++ ret = -ENOBUFS; ++ goto exit; ++ } ++ } else if (i != addr_id) { ++ ret = -EINVAL; ++ goto exit; ++ } ++ ++ priv->locaddr6[i].addr = saddr.in6; ++ priv->locaddr6[i].loc6_id = addr_id; ++ priv->locaddr6[i].low_prio = !!backup; ++ priv->locaddr6[i].if_idx = if_idx; ++ priv->loc6_bits |= 1 << i; ++ priv->announced6 &= ~(1 << i); ++ break; ++#endif ++ default: ++ ret = -EINVAL; ++ goto exit; ++ } ++ ++ mpcb->addr_signal = 1; ++ ++ rcu_read_lock_bh(); ++ subsk = mptcp_select_ack_sock(meta_sk); ++ if (subsk) ++ tcp_send_ack(subsk); ++ rcu_read_unlock_bh(); ++ ++exit: ++ release_sock(meta_sk); ++ mutex_unlock(&mpcb->mpcb_mutex); ++ sock_put(meta_sk); ++ return ret; ++} ++ ++static int ++mptcp_nl_genl_remove(struct sk_buff *skb, struct genl_info *info) ++{ ++ struct sock *meta_sk, *subsk; ++ struct mptcp_cb *mpcb; ++ struct mptcp_nl_priv *priv; ++ u32 token; ++ u8 addr_id; ++ int i; ++ int retcode; ++ bool found = false; ++ ++ if (!info->attrs[MPTCP_ATTR_TOKEN] || !info->attrs[MPTCP_ATTR_LOC_ID]) ++ return -EINVAL; ++ ++ token = nla_get_u32(info->attrs[MPTCP_ATTR_TOKEN]); ++ meta_sk = mptcp_hash_find(genl_info_net(info), token); ++ if (!meta_sk) ++ return -EINVAL; ++ ++ mpcb = tcp_sk(meta_sk)->mpcb; ++ priv = mptcp_nl_priv(meta_sk); ++ addr_id = nla_get_u8(info->attrs[MPTCP_ATTR_LOC_ID]); ++ ++ mutex_lock(&mpcb->mpcb_mutex); ++ lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING); ++ ++ mptcp_for_each_bit_set(priv->loc4_bits, i) { ++ if (priv->locaddr4[i].loc4_id == addr_id) { ++ priv->loc4_bits &= ~(1 << i); ++ found = true; ++ break; ++ } ++ } ++ ++#if IS_ENABLED(CONFIG_IPV6) ++ if (!found) { ++ mptcp_for_each_bit_set(priv->loc6_bits, i) { ++ if (priv->locaddr6[i].loc6_id == addr_id) { ++ priv->loc6_bits &= ~(1 << i); ++ found = true; ++ break; ++ } ++ } ++ } 
++#endif ++ ++ if (found) { ++ priv->remove_addrs |= 1 << addr_id; ++ mpcb->addr_signal = 1; ++ ++ rcu_read_lock_bh(); ++ subsk = mptcp_select_ack_sock(meta_sk); ++ if (subsk) ++ tcp_send_ack(subsk); ++ rcu_read_unlock_bh(); ++ retcode = 0; ++ } else { ++ retcode = -EINVAL; ++ } ++ ++ release_sock(meta_sk); ++ mutex_unlock(&mpcb->mpcb_mutex); ++ sock_put(meta_sk); ++ return retcode; ++} ++ ++static int ++mptcp_nl_genl_create(struct sk_buff *skb, struct genl_info *info) ++{ ++ struct sock *meta_sk, *subsk = NULL; ++ struct mptcp_cb *mpcb; ++ struct mptcp_nl_priv *priv; ++ u32 token; ++ u16 family, sport; ++ u8 loc_id, rem_id, backup = 0; ++ int i, ret = 0; ++ int if_idx; ++ ++ if (!info->attrs[MPTCP_ATTR_TOKEN] || !info->attrs[MPTCP_ATTR_FAMILY] || ++ !info->attrs[MPTCP_ATTR_LOC_ID] || !info->attrs[MPTCP_ATTR_REM_ID]) ++ return -EINVAL; ++ ++ token = nla_get_u32(info->attrs[MPTCP_ATTR_TOKEN]); ++ meta_sk = mptcp_hash_find(genl_info_net(info), token); ++ if (!meta_sk) ++ /* We use a more specific value than EINVAL here so that ++ * userspace can handle this specific case easily. This is ++ * useful to check the case in which userspace tries to create a ++ * subflow for a connection which was already destroyed recently ++ * in kernelspace, but userspace didn't have time to realize ++ * about it because there is a gap of time between kernel ++ * destroying the connection and userspace receiving the event ++ * through Netlink. It can easily happen for short life-time ++ * conns. ++ */ ++ return -EBADR; ++ ++ mpcb = tcp_sk(meta_sk)->mpcb; ++ ++ mutex_lock(&mpcb->mpcb_mutex); ++ lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING); ++ ++ if (sock_flag(meta_sk, SOCK_DEAD)) { ++ /* Same as for the EBADR case. In this case, though, we know for ++ * sure the conn owner of the subflow existed at some point (no ++ * invalid token possibility) ++ */ ++ ret = -EOWNERDEAD; ++ goto unlock; ++ } ++ ++ if (!mptcp_can_new_subflow(meta_sk)) { ++ /* Same as for the EBADR and EOWNERDEAD case but here, the MPTCP ++ * session has just been stopped, it is no longer possible to ++ * create new subflows. ++ */ ++ ret = -ENOTCONN; ++ goto unlock; ++ } ++ ++ if (mpcb->master_sk && ++ !tcp_sk(mpcb->master_sk)->mptcp->fully_established) { ++ /* First condition is not only in there for safely purposes, it ++ * can also be triggered in the same scenario as in EBADR and ++ * EOWNERDEAD ++ */ ++ ret = -EAGAIN; ++ goto unlock; ++ } ++ ++ priv = mptcp_nl_priv(meta_sk); ++ ++ family = nla_get_u16(info->attrs[MPTCP_ATTR_FAMILY]); ++ loc_id = nla_get_u8(info->attrs[MPTCP_ATTR_LOC_ID]); ++ rem_id = nla_get_u8(info->attrs[MPTCP_ATTR_REM_ID]); ++ ++ sport = info->attrs[MPTCP_ATTR_SPORT] ++ ? htons(nla_get_u16(info->attrs[MPTCP_ATTR_SPORT])) : 0; ++ backup = info->attrs[MPTCP_ATTR_BACKUP] ++ ? nla_get_u8(info->attrs[MPTCP_ATTR_BACKUP]) : 0; ++ if_idx = info->attrs[MPTCP_ATTR_IF_IDX] ++ ? 
nla_get_s32(info->attrs[MPTCP_ATTR_IF_IDX]) : 0; ++ ++ switch (family) { ++ case AF_INET: { ++ struct mptcp_rem4 rem = { ++ .rem4_id = rem_id, ++ }; ++ struct mptcp_loc4 loc = { ++ .loc4_id = loc_id, ++ }; ++ ++ if (!info->attrs[MPTCP_ATTR_DADDR4] || ++ !info->attrs[MPTCP_ATTR_DPORT]) { ++ goto create_failed; ++ } else { ++ rem.addr.s_addr = ++ nla_get_u32(info->attrs[MPTCP_ATTR_DADDR4]); ++ rem.port = ++ ntohs(nla_get_u16(info->attrs[MPTCP_ATTR_DPORT])); ++ } ++ ++ if (!info->attrs[MPTCP_ATTR_SADDR4]) { ++ bool found = false; ++ ++ mptcp_for_each_bit_set(priv->loc4_bits, i) { ++ if (priv->locaddr4[i].loc4_id == loc_id) { ++ loc.addr = priv->locaddr4[i].addr; ++ loc.low_prio = ++ priv->locaddr4[i].low_prio; ++ loc.if_idx = ++ priv->locaddr4[i].if_idx; ++ found = true; ++ break; ++ } ++ } ++ ++ if (!found) ++ goto create_failed; ++ } else { ++ loc.addr.s_addr = ++ nla_get_u32(info->attrs[MPTCP_ATTR_SADDR4]); ++ loc.low_prio = backup; ++ loc.if_idx = if_idx; ++ } ++ ++ ret = __mptcp_init4_subsockets(meta_sk, &loc, sport, &rem, ++ &subsk); ++ if (ret < 0) ++ goto unlock; ++ break; ++ } ++#if IS_ENABLED(CONFIG_IPV6) ++ case AF_INET6: { ++ struct mptcp_rem6 rem = { ++ .rem6_id = rem_id, ++ }; ++ struct mptcp_loc6 loc = { ++ .loc6_id = loc_id, ++ }; ++ ++ if (!info->attrs[MPTCP_ATTR_DADDR6] || ++ !info->attrs[MPTCP_ATTR_DPORT]) { ++ goto create_failed; ++ } else { ++ rem.addr = *(struct in6_addr *) ++ nla_data(info->attrs[MPTCP_ATTR_DADDR6]); ++ rem.port = ++ ntohs(nla_get_u16(info->attrs[MPTCP_ATTR_DPORT])); ++ } ++ ++ if (!info->attrs[MPTCP_ATTR_SADDR6]) { ++ bool found = false; ++ ++ mptcp_for_each_bit_set(priv->loc6_bits, i) { ++ if (priv->locaddr6[i].loc6_id == loc_id) { ++ loc.addr = priv->locaddr6[i].addr; ++ loc.low_prio = ++ priv->locaddr6[i].low_prio; ++ loc.if_idx = ++ priv->locaddr6[i].if_idx; ++ ++ found = true; ++ break; ++ } ++ } ++ ++ if (!found) ++ goto create_failed; ++ } else { ++ loc.addr = *(struct in6_addr *) ++ nla_data(info->attrs[MPTCP_ATTR_SADDR6]); ++ loc.low_prio = backup; ++ loc.if_idx = if_idx; ++ } ++ ++ ret = __mptcp_init6_subsockets(meta_sk, &loc, sport, &rem, ++ &subsk); ++ if (ret < 0) ++ goto unlock; ++ break; ++ } ++#endif ++ default: ++ goto create_failed; ++ } ++ ++unlock: ++ release_sock(meta_sk); ++ mutex_unlock(&mpcb->mpcb_mutex); ++ sock_put(meta_sk); ++ return ret; ++ ++create_failed: ++ ret = -EINVAL; ++ goto unlock; ++} ++ ++static struct sock * ++mptcp_nl_subsk_lookup(struct mptcp_cb *mpcb, struct nlattr **attrs) ++{ ++ struct sock *sk; ++ struct mptcp_tcp_sock *mptcp; ++ struct hlist_node *tmp; ++ u16 family; ++ __be16 sport, dport; ++ ++ if (!attrs[MPTCP_ATTR_FAMILY] || !attrs[MPTCP_ATTR_SPORT] || ++ !attrs[MPTCP_ATTR_DPORT]) ++ goto exit; ++ ++ family = nla_get_u16(attrs[MPTCP_ATTR_FAMILY]); ++ sport = htons(nla_get_u16(attrs[MPTCP_ATTR_SPORT])); ++ dport = htons(nla_get_u16(attrs[MPTCP_ATTR_DPORT])); ++ ++ switch (family) { ++ case AF_INET: { ++ __be32 saddr, daddr; ++ ++ if (!attrs[MPTCP_ATTR_SADDR4] || !attrs[MPTCP_ATTR_DADDR4]) ++ break; ++ ++ saddr = nla_get_u32(attrs[MPTCP_ATTR_SADDR4]); ++ daddr = nla_get_u32(attrs[MPTCP_ATTR_DADDR4]); ++ ++ mptcp_for_each_sub_safe(mpcb, mptcp, tmp) { ++ struct sock *subsk = mptcp_to_sock(mptcp); ++ struct inet_sock *isk = inet_sk(subsk); ++ ++ if (subsk->sk_family != AF_INET) ++ continue; ++ ++ if (isk->inet_saddr == saddr && ++ isk->inet_daddr == daddr && ++ isk->inet_sport == sport && ++ isk->inet_dport == dport) { ++ sk = subsk; ++ goto found; ++ } ++ } ++ break; ++ } ++#if IS_ENABLED(CONFIG_IPV6) ++ 
case AF_INET6: { ++ struct in6_addr saddr, daddr; ++ ++ if (!attrs[MPTCP_ATTR_SADDR6] || !attrs[MPTCP_ATTR_DADDR6]) ++ break; ++ ++ saddr = *(struct in6_addr *)nla_data(attrs[MPTCP_ATTR_SADDR6]); ++ daddr = *(struct in6_addr *)nla_data(attrs[MPTCP_ATTR_DADDR6]); ++ ++ mptcp_for_each_sub_safe(mpcb, mptcp, tmp) { ++ struct sock *subsk = mptcp_to_sock(mptcp); ++ struct inet_sock *isk = inet_sk(subsk); ++ struct ipv6_pinfo *np; ++ ++ if (subsk->sk_family != AF_INET6) ++ continue; ++ ++ np = inet6_sk(subsk); ++ if (ipv6_addr_equal(&saddr, &np->saddr) && ++ ipv6_addr_equal(&daddr, &subsk->sk_v6_daddr) && ++ isk->inet_sport == sport && ++ isk->inet_dport == dport) { ++ sk = subsk; ++ goto found; ++ } ++ } ++ break; ++ } ++#endif ++ } ++ ++exit: ++ sk = NULL; ++found: ++ return sk; ++} ++ ++static int ++mptcp_nl_genl_destroy(struct sk_buff *skb, struct genl_info *info) ++{ ++ struct sock *meta_sk, *subsk; ++ struct mptcp_cb *mpcb; ++ int ret = 0; ++ u32 token; ++ ++ if (!info->attrs[MPTCP_ATTR_TOKEN]) ++ return -EINVAL; ++ ++ token = nla_get_u32(info->attrs[MPTCP_ATTR_TOKEN]); ++ ++ meta_sk = mptcp_hash_find(genl_info_net(info), token); ++ if (!meta_sk) ++ return -EINVAL; ++ ++ mpcb = tcp_sk(meta_sk)->mpcb; ++ ++ mutex_lock(&mpcb->mpcb_mutex); ++ lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING); ++ ++ subsk = mptcp_nl_subsk_lookup(mpcb, info->attrs); ++ if (subsk) { ++ local_bh_disable(); ++ mptcp_reinject_data(subsk, 0); ++ mptcp_send_reset(subsk); ++ local_bh_enable(); ++ } else { ++ ret = -EINVAL; ++ } ++ ++ release_sock(meta_sk); ++ mutex_unlock(&mpcb->mpcb_mutex); ++ sock_put(meta_sk); ++ return ret; ++} ++ ++static int ++mptcp_nl_genl_conn_exists(struct sk_buff *skb, struct genl_info *info) ++{ ++ struct sock *meta_sk; ++ u32 token; ++ ++ if (!info->attrs[MPTCP_ATTR_TOKEN]) ++ return -EINVAL; ++ ++ token = nla_get_u32(info->attrs[MPTCP_ATTR_TOKEN]); ++ ++ meta_sk = mptcp_hash_find(genl_info_net(info), token); ++ if (!meta_sk) ++ return -ENOTCONN; ++ ++ sock_put(meta_sk); ++ return 0; ++} ++ ++static int ++mptcp_nl_genl_priority(struct sk_buff *skb, struct genl_info *info) ++{ ++ struct sock *meta_sk, *subsk; ++ struct mptcp_cb *mpcb; ++ int ret = 0; ++ u32 token; ++ u8 backup = 0; ++ ++ if (!info->attrs[MPTCP_ATTR_TOKEN]) ++ return -EINVAL; ++ ++ token = nla_get_u32(info->attrs[MPTCP_ATTR_TOKEN]); ++ if (info->attrs[MPTCP_ATTR_BACKUP]) ++ backup = nla_get_u8(info->attrs[MPTCP_ATTR_BACKUP]); ++ ++ meta_sk = mptcp_hash_find(genl_info_net(info), token); ++ if (!meta_sk) ++ return -EINVAL; ++ ++ mpcb = tcp_sk(meta_sk)->mpcb; ++ ++ mutex_lock(&mpcb->mpcb_mutex); ++ lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING); ++ ++ subsk = mptcp_nl_subsk_lookup(mpcb, info->attrs); ++ if (subsk) { ++ tcp_sk(subsk)->mptcp->send_mp_prio = 1; ++ tcp_sk(subsk)->mptcp->low_prio = !!backup; ++ ++ local_bh_disable(); ++ if (mptcp_sk_can_send_ack(subsk)) ++ tcp_send_ack(subsk); ++ else ++ ret = -ENOTCONN; ++ local_bh_enable(); ++ } else { ++ ret = -EINVAL; ++ } ++ ++ release_sock(meta_sk); ++ mutex_unlock(&mpcb->mpcb_mutex); ++ sock_put(meta_sk); ++ return ret; ++} ++ ++static int ++mptcp_nl_genl_set_filter(struct sk_buff *skb, struct genl_info *info) ++{ ++ u16 flags; ++ ++ if (!info->attrs[MPTCP_ATTR_FLAGS]) ++ return -EINVAL; ++ ++ flags = nla_get_u16(info->attrs[MPTCP_ATTR_FLAGS]); ++ ++ /* Only want to receive events that correspond to these flags */ ++ mptcp_nl_event_filter = ~flags; ++ ++ return 0; ++} ++ ++static struct genl_ops mptcp_genl_ops[] = { ++ { ++ .cmd = MPTCP_CMD_ANNOUNCE, ++ .doit = 
mptcp_nl_genl_announce, ++ .flags = GENL_ADMIN_PERM, ++ }, ++ { ++ .cmd = MPTCP_CMD_REMOVE, ++ .doit = mptcp_nl_genl_remove, ++ .flags = GENL_ADMIN_PERM, ++ }, ++ { ++ .cmd = MPTCP_CMD_SUB_CREATE, ++ .doit = mptcp_nl_genl_create, ++ .flags = GENL_ADMIN_PERM, ++ }, ++ { ++ .cmd = MPTCP_CMD_SUB_DESTROY, ++ .doit = mptcp_nl_genl_destroy, ++ .flags = GENL_ADMIN_PERM, ++ }, ++ { ++ .cmd = MPTCP_CMD_SUB_PRIORITY, ++ .doit = mptcp_nl_genl_priority, ++ .flags = GENL_ADMIN_PERM, ++ }, ++ { ++ .cmd = MPTCP_CMD_SET_FILTER, ++ .doit = mptcp_nl_genl_set_filter, ++ .flags = GENL_ADMIN_PERM, ++ }, ++ { ++ .cmd = MPTCP_CMD_EXIST, ++ .doit = mptcp_nl_genl_conn_exists, ++ .flags = GENL_ADMIN_PERM, ++ }, ++}; ++ ++static struct mptcp_pm_ops mptcp_nl_pm_ops = { ++ .new_session = mptcp_nl_pm_new_session, ++ .close_session = mptcp_nl_pm_close_session, ++ .fully_established = mptcp_nl_pm_fully_established, ++ .established_subflow = mptcp_nl_pm_established_subflow, ++ .delete_subflow = mptcp_nl_pm_delete_subflow, ++ .add_raddr = mptcp_nl_pm_add_raddr, ++ .rem_raddr = mptcp_nl_pm_rem_raddr, ++ .get_local_id = mptcp_nl_pm_get_local_id, ++ .addr_signal = mptcp_nl_pm_addr_signal, ++ .prio_changed = mptcp_nl_pm_prio_changed, ++ .name = "netlink", ++ .owner = THIS_MODULE, ++}; ++ ++static struct genl_family mptcp_genl_family = { ++ .hdrsize = 0, ++ .name = MPTCP_GENL_NAME, ++ .version = MPTCP_GENL_VER, ++ .maxattr = MPTCP_ATTR_MAX, ++ .policy = mptcp_nl_genl_policy, ++ .netnsok = true, ++ .module = THIS_MODULE, ++ .ops = mptcp_genl_ops, ++ .n_ops = ARRAY_SIZE(mptcp_genl_ops), ++ .mcgrps = mptcp_mcgrps, ++ .n_mcgrps = ARRAY_SIZE(mptcp_mcgrps), ++}; ++ ++static int __init ++mptcp_nl_init(void) ++{ ++ int ret; ++ ++ BUILD_BUG_ON(sizeof(struct mptcp_nl_priv) > MPTCP_PM_SIZE); ++ ++ ret = genl_register_family(&mptcp_genl_family); ++ if (ret) ++ goto out_genl; ++ ++ ret = mptcp_register_path_manager(&mptcp_nl_pm_ops); ++ if (ret) ++ goto out_pm; ++ ++ return 0; ++out_pm: ++ genl_unregister_family(&mptcp_genl_family); ++out_genl: ++ return ret; ++} ++ ++static void __exit ++mptcp_nl_exit(void) ++{ ++ mptcp_unregister_path_manager(&mptcp_nl_pm_ops); ++ genl_unregister_family(&mptcp_genl_family); ++} ++ ++module_init(mptcp_nl_init); ++module_exit(mptcp_nl_exit); ++ ++MODULE_AUTHOR("Gregory Detal "); ++MODULE_LICENSE("GPL"); ++MODULE_DESCRIPTION("MPTCP netlink-based path manager"); ++MODULE_ALIAS_GENL_FAMILY(MPTCP_GENL_NAME); +diff --git a/net/mptcp/mptcp_olia.c b/net/mptcp/mptcp_olia.c +new file mode 100644 +index 000000000000..c44eb9208581 +--- /dev/null ++++ b/net/mptcp/mptcp_olia.c +@@ -0,0 +1,318 @@ ++/* ++ * MPTCP implementation - OPPORTUNISTIC LINKED INCREASES CONGESTION CONTROL: ++ * ++ * Algorithm design: ++ * Ramin Khalili ++ * Nicolas Gast ++ * Jean-Yves Le Boudec ++ * ++ * Implementation: ++ * Ramin Khalili ++ * ++ * Ported to the official MPTCP-kernel: ++ * Christoph Paasch ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License ++ * as published by the Free Software Foundation; either version ++ * 2 of the License, or (at your option) any later version. 
++ */ ++ ++ ++#include ++#include ++ ++#include ++ ++static int scale = 10; ++ ++struct mptcp_olia { ++ u32 mptcp_loss1; ++ u32 mptcp_loss2; ++ u32 mptcp_loss3; ++ int epsilon_num; ++ u32 epsilon_den; ++ int mptcp_snd_cwnd_cnt; ++}; ++ ++static inline int mptcp_olia_sk_can_send(const struct sock *sk) ++{ ++ return mptcp_sk_can_send(sk) && tcp_sk(sk)->srtt_us; ++} ++ ++static inline u64 mptcp_olia_scale(u64 val, int scale) ++{ ++ return (u64) val << scale; ++} ++ ++/* take care of artificially inflate (see RFC5681) ++ * of cwnd during fast-retransmit phase ++ */ ++static u32 mptcp_get_crt_cwnd(struct sock *sk) ++{ ++ const struct inet_connection_sock *icsk = inet_csk(sk); ++ ++ if (icsk->icsk_ca_state == TCP_CA_Recovery) ++ return tcp_sk(sk)->snd_ssthresh; ++ else ++ return tcp_sk(sk)->snd_cwnd; ++} ++ ++/* return the dominator of the first term of the increasing term */ ++static u64 mptcp_get_rate(const struct mptcp_cb *mpcb , u32 path_rtt) ++{ ++ struct mptcp_tcp_sock *mptcp; ++ u64 rate = 1; /* We have to avoid a zero-rate because it is used as a divisor */ ++ ++ mptcp_for_each_sub(mpcb, mptcp) { ++ struct sock *sk = mptcp_to_sock(mptcp); ++ struct tcp_sock *tp = tcp_sk(sk); ++ u64 scaled_num; ++ u32 tmp_cwnd; ++ ++ if (!mptcp_olia_sk_can_send(sk)) ++ continue; ++ ++ tmp_cwnd = mptcp_get_crt_cwnd(sk); ++ scaled_num = mptcp_olia_scale(tmp_cwnd, scale) * path_rtt; ++ rate += div_u64(scaled_num , tp->srtt_us); ++ } ++ rate *= rate; ++ return rate; ++} ++ ++/* find the maximum cwnd, used to find set M */ ++static u32 mptcp_get_max_cwnd(const struct mptcp_cb *mpcb) ++{ ++ struct mptcp_tcp_sock *mptcp; ++ u32 best_cwnd = 0; ++ ++ mptcp_for_each_sub(mpcb, mptcp) { ++ struct sock *sk = mptcp_to_sock(mptcp); ++ u32 tmp_cwnd; ++ ++ if (!mptcp_olia_sk_can_send(sk)) ++ continue; ++ ++ tmp_cwnd = mptcp_get_crt_cwnd(sk); ++ if (tmp_cwnd > best_cwnd) ++ best_cwnd = tmp_cwnd; ++ } ++ return best_cwnd; ++} ++ ++static void mptcp_get_epsilon(const struct mptcp_cb *mpcb) ++{ ++ struct mptcp_tcp_sock *mptcp; ++ struct mptcp_olia *ca; ++ struct tcp_sock *tp; ++ struct sock *sk; ++ u64 tmp_int, tmp_rtt, best_int = 0, best_rtt = 1; ++ u32 max_cwnd, tmp_cwnd, established_cnt = 0; ++ u8 M = 0, B_not_M = 0; ++ ++ /* TODO - integrate this in the following loop - we just want to iterate once */ ++ ++ max_cwnd = mptcp_get_max_cwnd(mpcb); ++ ++ /* find the best path */ ++ mptcp_for_each_sub(mpcb, mptcp) { ++ sk = mptcp_to_sock(mptcp); ++ tp = tcp_sk(sk); ++ ca = inet_csk_ca(sk); ++ ++ if (!mptcp_olia_sk_can_send(sk)) ++ continue; ++ ++ established_cnt++; ++ ++ tmp_rtt = (u64)tp->srtt_us * tp->srtt_us; ++ /* TODO - check here and rename variables */ ++ tmp_int = max(ca->mptcp_loss3 - ca->mptcp_loss2, ++ ca->mptcp_loss2 - ca->mptcp_loss1); ++ ++ if ((u64)tmp_int * best_rtt >= (u64)best_int * tmp_rtt) { ++ best_rtt = tmp_rtt; ++ best_int = tmp_int; ++ } ++ } ++ ++ /* TODO - integrate this here in mptcp_get_max_cwnd and in the previous loop */ ++ /* find the size of M and B_not_M */ ++ mptcp_for_each_sub(mpcb, mptcp) { ++ sk = mptcp_to_sock(mptcp); ++ tp = tcp_sk(sk); ++ ca = inet_csk_ca(sk); ++ ++ if (!mptcp_olia_sk_can_send(sk)) ++ continue; ++ ++ tmp_cwnd = mptcp_get_crt_cwnd(sk); ++ if (tmp_cwnd == max_cwnd) { ++ M++; ++ } else { ++ tmp_rtt = (u64)tp->srtt_us * tp->srtt_us; ++ tmp_int = max(ca->mptcp_loss3 - ca->mptcp_loss2, ++ ca->mptcp_loss2 - ca->mptcp_loss1); ++ ++ if ((u64)tmp_int * best_rtt == (u64)best_int * tmp_rtt) ++ B_not_M++; ++ } ++ } ++ ++ /* check if the path is in M or B_not_M and set the value of 
epsilon accordingly */ ++ mptcp_for_each_sub(mpcb, mptcp) { ++ sk = mptcp_to_sock(mptcp); ++ tp = tcp_sk(sk); ++ ca = inet_csk_ca(sk); ++ ++ if (!mptcp_olia_sk_can_send(sk)) ++ continue; ++ ++ if (B_not_M == 0) { ++ ca->epsilon_num = 0; ++ ca->epsilon_den = 1; ++ } else { ++ tmp_rtt = (u64)tp->srtt_us * tp->srtt_us; ++ tmp_int = max(ca->mptcp_loss3 - ca->mptcp_loss2, ++ ca->mptcp_loss2 - ca->mptcp_loss1); ++ tmp_cwnd = mptcp_get_crt_cwnd(sk); ++ ++ if (tmp_cwnd < max_cwnd && ++ (u64)tmp_int * best_rtt == (u64)best_int * tmp_rtt) { ++ ca->epsilon_num = 1; ++ ca->epsilon_den = established_cnt * B_not_M; ++ } else if (tmp_cwnd == max_cwnd) { ++ ca->epsilon_num = -1; ++ ca->epsilon_den = established_cnt * M; ++ } else { ++ ca->epsilon_num = 0; ++ ca->epsilon_den = 1; ++ } ++ } ++ } ++} ++ ++/* setting the initial values */ ++static void mptcp_olia_init(struct sock *sk) ++{ ++ const struct tcp_sock *tp = tcp_sk(sk); ++ struct mptcp_olia *ca = inet_csk_ca(sk); ++ ++ if (mptcp(tp)) { ++ ca->mptcp_loss1 = tp->snd_una; ++ ca->mptcp_loss2 = tp->snd_una; ++ ca->mptcp_loss3 = tp->snd_una; ++ ca->mptcp_snd_cwnd_cnt = 0; ++ ca->epsilon_num = 0; ++ ca->epsilon_den = 1; ++ } ++} ++ ++/* updating inter-loss distance and ssthresh */ ++static void mptcp_olia_set_state(struct sock *sk, u8 new_state) ++{ ++ if (!mptcp(tcp_sk(sk))) ++ return; ++ ++ if (new_state == TCP_CA_Loss || ++ new_state == TCP_CA_Recovery || new_state == TCP_CA_CWR) { ++ struct mptcp_olia *ca = inet_csk_ca(sk); ++ ++ if (ca->mptcp_loss3 != ca->mptcp_loss2 && ++ !inet_csk(sk)->icsk_retransmits) { ++ ca->mptcp_loss1 = ca->mptcp_loss2; ++ ca->mptcp_loss2 = ca->mptcp_loss3; ++ } ++ } ++} ++ ++/* main algorithm */ ++static void mptcp_olia_cong_avoid(struct sock *sk, u32 ack, u32 acked) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct mptcp_olia *ca = inet_csk_ca(sk); ++ const struct mptcp_cb *mpcb = tp->mpcb; ++ ++ u64 inc_num, inc_den, rate, cwnd_scaled; ++ ++ if (!mptcp(tp)) { ++ tcp_reno_cong_avoid(sk, ack, acked); ++ return; ++ } ++ ++ ca->mptcp_loss3 = tp->snd_una; ++ ++ if (!tcp_is_cwnd_limited(sk)) ++ return; ++ ++ /* slow start if it is in the safe area */ ++ if (tcp_in_slow_start(tp)) { ++ tcp_slow_start(tp, acked); ++ return; ++ } ++ ++ mptcp_get_epsilon(mpcb); ++ rate = mptcp_get_rate(mpcb, tp->srtt_us); ++ cwnd_scaled = mptcp_olia_scale(tp->snd_cwnd, scale); ++ inc_den = ca->epsilon_den * tp->snd_cwnd * rate ? 
: 1; ++ ++ /* calculate the increasing term, scaling is used to reduce the rounding effect */ ++ if (ca->epsilon_num == -1) { ++ if (ca->epsilon_den * cwnd_scaled * cwnd_scaled < rate) { ++ inc_num = rate - ca->epsilon_den * ++ cwnd_scaled * cwnd_scaled; ++ ca->mptcp_snd_cwnd_cnt -= div64_u64( ++ mptcp_olia_scale(inc_num , scale) , inc_den); ++ } else { ++ inc_num = ca->epsilon_den * ++ cwnd_scaled * cwnd_scaled - rate; ++ ca->mptcp_snd_cwnd_cnt += div64_u64( ++ mptcp_olia_scale(inc_num , scale) , inc_den); ++ } ++ } else { ++ inc_num = ca->epsilon_num * rate + ++ ca->epsilon_den * cwnd_scaled * cwnd_scaled; ++ ca->mptcp_snd_cwnd_cnt += div64_u64( ++ mptcp_olia_scale(inc_num , scale) , inc_den); ++ } ++ ++ ++ if (ca->mptcp_snd_cwnd_cnt >= (1 << scale) - 1) { ++ if (tp->snd_cwnd < tp->snd_cwnd_clamp) ++ tp->snd_cwnd++; ++ ca->mptcp_snd_cwnd_cnt = 0; ++ } else if (ca->mptcp_snd_cwnd_cnt <= 0 - (1 << scale) + 1) { ++ tp->snd_cwnd = max((int) 1 , (int) tp->snd_cwnd - 1); ++ ca->mptcp_snd_cwnd_cnt = 0; ++ } ++} ++ ++static struct tcp_congestion_ops mptcp_olia = { ++ .init = mptcp_olia_init, ++ .ssthresh = tcp_reno_ssthresh, ++ .cong_avoid = mptcp_olia_cong_avoid, ++ .undo_cwnd = tcp_reno_undo_cwnd, ++ .set_state = mptcp_olia_set_state, ++ .owner = THIS_MODULE, ++ .name = "olia", ++}; ++ ++static int __init mptcp_olia_register(void) ++{ ++ BUILD_BUG_ON(sizeof(struct mptcp_olia) > ICSK_CA_PRIV_SIZE); ++ return tcp_register_congestion_control(&mptcp_olia); ++} ++ ++static void __exit mptcp_olia_unregister(void) ++{ ++ tcp_unregister_congestion_control(&mptcp_olia); ++} ++ ++module_init(mptcp_olia_register); ++module_exit(mptcp_olia_unregister); ++ ++MODULE_AUTHOR("Ramin Khalili, Nicolas Gast, Jean-Yves Le Boudec"); ++MODULE_LICENSE("GPL"); ++MODULE_DESCRIPTION("MPTCP COUPLED CONGESTION CONTROL"); ++MODULE_VERSION("0.1"); +diff --git a/net/mptcp/mptcp_output.c b/net/mptcp/mptcp_output.c +new file mode 100644 +index 000000000000..39eae2199802 +--- /dev/null ++++ b/net/mptcp/mptcp_output.c +@@ -0,0 +1,2009 @@ ++/* ++ * MPTCP implementation - Sending side ++ * ++ * Initial Design & Implementation: ++ * SĂ©bastien BarrĂ© ++ * ++ * Current Maintainer & Author: ++ * Christoph Paasch ++ * ++ * Additional authors: ++ * Jaakko Korkeaniemi ++ * Gregory Detal ++ * Fabien DuchĂªne ++ * Andreas Seelinger ++ * Lavkesh Lahngir ++ * Andreas Ripke ++ * Vlad Dogaru ++ * Octavian Purdila ++ * John Ronan ++ * Catalin Nicutar ++ * Brandon Heller ++ * ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License ++ * as published by the Free Software Foundation; either version ++ * 2 of the License, or (at your option) any later version. 
++ */ ++ ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++ ++static const int mptcp_dss_len = MPTCP_SUB_LEN_DSS_ALIGN + ++ MPTCP_SUB_LEN_ACK_ALIGN + ++ MPTCP_SUB_LEN_SEQ_ALIGN; ++ ++static inline int mptcp_sub_len_remove_addr(u16 bitfield) ++{ ++ unsigned int c; ++ for (c = 0; bitfield; c++) ++ bitfield &= bitfield - 1; ++ return MPTCP_SUB_LEN_REMOVE_ADDR + c - 1; ++} ++ ++int mptcp_sub_len_remove_addr_align(u16 bitfield) ++{ ++ return ALIGN(mptcp_sub_len_remove_addr(bitfield), 4); ++} ++EXPORT_SYMBOL(mptcp_sub_len_remove_addr_align); ++ ++/* get the data-seq and end-data-seq and store them again in the ++ * tcp_skb_cb ++ */ ++static bool mptcp_reconstruct_mapping(struct sk_buff *skb) ++{ ++ const struct mp_dss *mpdss = (struct mp_dss *)TCP_SKB_CB(skb)->dss; ++ __be32 *p32; ++ __be16 *p16; ++ ++ if (!mptcp_is_data_seq(skb)) ++ return false; ++ ++ if (!mpdss->M) ++ return false; ++ ++ /* Move the pointer to the data-seq */ ++ p32 = (__be32 *)mpdss; ++ p32++; ++ if (mpdss->A) { ++ p32++; ++ if (mpdss->a) ++ p32++; ++ } ++ ++ TCP_SKB_CB(skb)->seq = ntohl(*p32); ++ ++ /* Get the data_len to calculate the end_data_seq */ ++ p32++; ++ p32++; ++ p16 = (__be16 *)p32; ++ TCP_SKB_CB(skb)->end_seq = ntohs(*p16) + TCP_SKB_CB(skb)->seq; ++ ++ return true; ++} ++ ++static bool mptcp_is_reinjected(const struct sk_buff *skb) ++{ ++ return TCP_SKB_CB(skb)->mptcp_flags & MPTCP_REINJECT; ++} ++ ++static void mptcp_find_and_set_pathmask(struct sock *meta_sk, struct sk_buff *skb) ++{ ++ struct rb_node **p = &meta_sk->tcp_rtx_queue.rb_node; ++ struct rb_node *parent; ++ struct sk_buff *skb_it; ++ ++ while (*p) { ++ parent = *p; ++ skb_it = rb_to_skb(parent); ++ if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb_it)->seq)) { ++ p = &parent->rb_left; ++ continue; ++ } ++ if (after(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb_it)->seq)) { ++ p = &parent->rb_right; ++ continue; ++ } ++ ++ TCP_SKB_CB(skb)->path_mask = TCP_SKB_CB(skb_it)->path_mask; ++ break; ++ } ++} ++ ++/* Reinject data from one TCP subflow to the meta_sk. If sk == NULL, we are ++ * coming from the meta-retransmit-timer ++ */ ++static void __mptcp_reinject_data(struct sk_buff *orig_skb, struct sock *meta_sk, ++ struct sock *sk, int clone_it, ++ enum tcp_queue tcp_queue) ++{ ++ struct sk_buff *skb, *skb1; ++ const struct tcp_sock *meta_tp = tcp_sk(meta_sk); ++ struct mptcp_cb *mpcb = meta_tp->mpcb; ++ u32 seq, end_seq; ++ ++ if (clone_it) { ++ /* pskb_copy is necessary here, because the TCP/IP-headers ++ * will be changed when it's going to be reinjected on another ++ * subflow. 
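++	 * A plain skb_clone() would share the header data with the original
++	 * skb, so rewriting the TCP/IP headers for the new subflow would also
++	 * corrupt the copy still queued on the old subflow.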
++ */ ++ tcp_skb_tsorted_save(orig_skb) { ++ skb = pskb_copy_for_clone(orig_skb, GFP_ATOMIC); ++ } tcp_skb_tsorted_restore(orig_skb); ++ } else { ++ if (tcp_queue == TCP_FRAG_IN_WRITE_QUEUE) { ++ __skb_unlink(orig_skb, &sk->sk_write_queue); ++ } else { ++ list_del(&orig_skb->tcp_tsorted_anchor); ++ tcp_rtx_queue_unlink(orig_skb, sk); ++ INIT_LIST_HEAD(&orig_skb->tcp_tsorted_anchor); ++ } ++ sock_set_flag(sk, SOCK_QUEUE_SHRUNK); ++ sk->sk_wmem_queued -= orig_skb->truesize; ++ sk_mem_uncharge(sk, orig_skb->truesize); ++ skb = orig_skb; ++ } ++ if (unlikely(!skb)) ++ return; ++ ++ /* Make sure that this list is clean */ ++ tcp_skb_tsorted_anchor_cleanup(skb); ++ ++ if (sk && !mptcp_reconstruct_mapping(skb)) { ++ __kfree_skb(skb); ++ return; ++ } ++ ++ skb->sk = meta_sk; ++ ++ /* Reset subflow-specific TCP control-data */ ++ TCP_SKB_CB(skb)->sacked = 0; ++ TCP_SKB_CB(skb)->tcp_flags &= (TCPHDR_ACK | TCPHDR_PSH); ++ ++ /* If it reached already the destination, we don't have to reinject it */ ++ if (!after(TCP_SKB_CB(skb)->end_seq, meta_tp->snd_una)) { ++ __kfree_skb(skb); ++ return; ++ } ++ ++ /* Only reinject segments that are fully covered by the mapping */ ++ if (skb->len + (mptcp_is_data_fin(skb) ? 1 : 0) != ++ TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq) { ++ struct rb_node *parent, **p = &meta_sk->tcp_rtx_queue.rb_node; ++ u32 end_seq = TCP_SKB_CB(skb)->end_seq; ++ u32 seq = TCP_SKB_CB(skb)->seq; ++ ++ __kfree_skb(skb); ++ ++ /* Ok, now we have to look for the full mapping in the meta ++ * send-queue :S ++ */ ++ ++ /* First, find the first skb that covers us */ ++ while (*p) { ++ parent = *p; ++ skb = rb_to_skb(parent); ++ ++ /* Not yet at the mapping? */ ++ if (!after(end_seq, TCP_SKB_CB(skb)->seq)) { ++ p = &parent->rb_left; ++ continue; ++ } ++ ++ if (!before(seq, TCP_SKB_CB(skb)->end_seq)) { ++ p = &parent->rb_right; ++ continue; ++ } ++ ++ break; ++ } ++ ++ if (*p) { ++ /* We found it, now let's reinject everything */ ++ skb = rb_to_skb(*p); ++ ++ skb_rbtree_walk_from(skb) { ++ if (after(TCP_SKB_CB(skb)->end_seq, end_seq)) ++ return; ++ __mptcp_reinject_data(skb, meta_sk, NULL, 1, ++ TCP_FRAG_IN_RTX_QUEUE); ++ } ++ } ++ return; ++ } ++ ++ /* Segment goes back to the MPTCP-layer. So, we need to zero the ++ * path_mask/dss. ++ */ ++ memset(TCP_SKB_CB(skb)->dss, 0 , mptcp_dss_len); ++ ++ /* We need to find out the path-mask from the meta-write-queue ++ * to properly select a subflow. ++ */ ++ mptcp_find_and_set_pathmask(meta_sk, skb); ++ ++ /* If it's empty, just add */ ++ if (skb_queue_empty(&mpcb->reinject_queue)) { ++ skb_queue_head(&mpcb->reinject_queue, skb); ++ return; ++ } ++ ++ /* Find place to insert skb - or even we can 'drop' it, as the ++ * data is already covered by other skb's in the reinject-queue. ++ * ++ * This is inspired by code from tcp_data_queue. ++ */ ++ ++ skb1 = skb_peek_tail(&mpcb->reinject_queue); ++ seq = TCP_SKB_CB(skb)->seq; ++ while (1) { ++ if (!after(TCP_SKB_CB(skb1)->seq, seq)) ++ break; ++ if (skb_queue_is_first(&mpcb->reinject_queue, skb1)) { ++ skb1 = NULL; ++ break; ++ } ++ skb1 = skb_queue_prev(&mpcb->reinject_queue, skb1); ++ } ++ ++ /* Do skb overlap to previous one? */ ++ end_seq = TCP_SKB_CB(skb)->end_seq; ++ if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) { ++ if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) { ++ /* All the bits are present. 
Don't reinject */ ++ __kfree_skb(skb); ++ return; ++ } ++ if (seq == TCP_SKB_CB(skb1)->seq) { ++ if (skb_queue_is_first(&mpcb->reinject_queue, skb1)) ++ skb1 = NULL; ++ else ++ skb1 = skb_queue_prev(&mpcb->reinject_queue, skb1); ++ } ++ } ++ if (!skb1) ++ __skb_queue_head(&mpcb->reinject_queue, skb); ++ else ++ __skb_queue_after(&mpcb->reinject_queue, skb1, skb); ++ ++ /* And clean segments covered by new one as whole. */ ++ while (!skb_queue_is_last(&mpcb->reinject_queue, skb)) { ++ skb1 = skb_queue_next(&mpcb->reinject_queue, skb); ++ ++ if (!after(end_seq, TCP_SKB_CB(skb1)->seq)) ++ break; ++ ++ if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) ++ break; ++ ++ __skb_unlink(skb1, &mpcb->reinject_queue); ++ __kfree_skb(skb1); ++ } ++ return; ++} ++ ++/* Inserts data into the reinject queue */ ++void mptcp_reinject_data(struct sock *sk, int clone_it) ++{ ++ struct sock *meta_sk = mptcp_meta_sk(sk); ++ struct sk_buff *skb_it, *tmp; ++ enum tcp_queue tcp_queue; ++ ++ /* It has already been closed - there is really no point in reinjecting */ ++ if (meta_sk->sk_state == TCP_CLOSE) ++ return; ++ ++ skb_queue_walk_safe(&sk->sk_write_queue, skb_it, tmp) { ++ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb_it); ++ /* Subflow syn's and fin's are not reinjected. ++ * ++ * As well as empty subflow-fins with a data-fin. ++ * They are reinjected below (without the subflow-fin-flag) ++ */ ++ if (tcb->tcp_flags & TCPHDR_SYN || ++ (tcb->tcp_flags & TCPHDR_FIN && !mptcp_is_data_fin(skb_it)) || ++ (tcb->tcp_flags & TCPHDR_FIN && mptcp_is_data_fin(skb_it) && !skb_it->len)) ++ continue; ++ ++ if (mptcp_is_reinjected(skb_it)) ++ continue; ++ ++ tcb->mptcp_flags |= MPTCP_REINJECT; ++ __mptcp_reinject_data(skb_it, meta_sk, sk, clone_it, ++ TCP_FRAG_IN_WRITE_QUEUE); ++ } ++ ++ skb_it = tcp_rtx_queue_head(sk); ++ skb_rbtree_walk_from_safe(skb_it, tmp) { ++ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb_it); ++ ++ /* Subflow syn's and fin's are not reinjected. ++ * ++ * As well as empty subflow-fins with a data-fin. ++ * They are reinjected below (without the subflow-fin-flag) ++ */ ++ if (tcb->tcp_flags & TCPHDR_SYN || ++ (tcb->tcp_flags & TCPHDR_FIN && !mptcp_is_data_fin(skb_it)) || ++ (tcb->tcp_flags & TCPHDR_FIN && mptcp_is_data_fin(skb_it) && !skb_it->len)) ++ continue; ++ ++ if (mptcp_is_reinjected(skb_it)) ++ continue; ++ ++ tcb->mptcp_flags |= MPTCP_REINJECT; ++ __mptcp_reinject_data(skb_it, meta_sk, sk, clone_it, ++ TCP_FRAG_IN_RTX_QUEUE); ++ } ++ ++ skb_it = tcp_write_queue_tail(meta_sk); ++ tcp_queue = TCP_FRAG_IN_WRITE_QUEUE; ++ ++ if (!skb_it) { ++ skb_it = skb_rb_last(&meta_sk->tcp_rtx_queue); ++ tcp_queue = TCP_FRAG_IN_RTX_QUEUE; ++ } ++ ++ /* If sk has sent the empty data-fin, we have to reinject it too. */ ++ if (skb_it && mptcp_is_data_fin(skb_it) && skb_it->len == 0 && ++ TCP_SKB_CB(skb_it)->path_mask & mptcp_pi_to_flag(tcp_sk(sk)->mptcp->path_index)) { ++ __mptcp_reinject_data(skb_it, meta_sk, NULL, 1, tcp_queue); ++ } ++ ++ tcp_sk(sk)->pf = 1; ++ ++ mptcp_push_pending_frames(meta_sk); ++} ++EXPORT_SYMBOL(mptcp_reinject_data); ++ ++static void mptcp_combine_dfin(const struct sk_buff *skb, ++ const struct sock *meta_sk, ++ struct sock *subsk) ++{ ++ const struct tcp_sock *meta_tp = tcp_sk(meta_sk); ++ const struct mptcp_cb *mpcb = meta_tp->mpcb; ++ ++ /* In infinite mapping we always try to combine */ ++ if (mpcb->infinite_mapping_snd) ++ goto combine; ++ ++ /* Don't combine, if they didn't combine when closing - otherwise we end ++ * up in TIME_WAIT, even if our app is smart enough to avoid it. 
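++	 *
++	 * "Combining" here means piggy-backing the subflow-level FIN on the
++	 * segment that carries the DATA_FIN (see the tcp_close_state() call
++	 * below).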
++ */ ++ if (!mptcp_sk_can_recv(meta_sk) && !mpcb->dfin_combined) ++ return; ++ ++ /* Don't combine if there is still outstanding data that remains to be ++ * DATA_ACKed, because otherwise we may never be able to deliver this. ++ */ ++ if (meta_tp->snd_una != TCP_SKB_CB(skb)->seq) ++ return; ++ ++combine: ++ if (tcp_close_state(subsk)) { ++ subsk->sk_shutdown |= SEND_SHUTDOWN; ++ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN; ++ } ++} ++ ++static int mptcp_write_dss_mapping(const struct tcp_sock *tp, const struct sk_buff *skb, ++ __be32 *ptr) ++{ ++ const struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); ++ __be32 *start = ptr; ++ __u16 data_len; ++ ++ *ptr++ = htonl(tcb->seq); /* data_seq */ ++ ++ /* If it's a non-data DATA_FIN, we set subseq to 0 (draft v7) */ ++ if (mptcp_is_data_fin(skb) && skb->len == 0) ++ *ptr++ = 0; /* subseq */ ++ else ++ *ptr++ = htonl(tp->write_seq - tp->mptcp->snt_isn); /* subseq */ ++ ++ if (tcb->mptcp_flags & MPTCPHDR_INF) ++ data_len = 0; ++ else ++ data_len = tcb->end_seq - tcb->seq; ++ ++ if (tp->mpcb->dss_csum && data_len) { ++ __sum16 *p16 = (__sum16 *)ptr; ++ __be32 hdseq = mptcp_get_highorder_sndbits(skb, tp->mpcb); ++ __wsum csum; ++ ++ *ptr = htonl(((data_len) << 16) | ++ (TCPOPT_EOL << 8) | ++ (TCPOPT_EOL)); ++ csum = csum_partial(ptr - 2, 12, skb->csum); ++ p16++; ++ *p16++ = csum_fold(csum_partial(&hdseq, sizeof(hdseq), csum)); ++ } else { ++ *ptr++ = htonl(((data_len) << 16) | ++ (TCPOPT_NOP << 8) | ++ (TCPOPT_NOP)); ++ } ++ ++ return ptr - start; ++} ++ ++static int mptcp_write_dss_data_ack(const struct tcp_sock *tp, const struct sk_buff *skb, ++ __be32 *ptr) ++{ ++ struct mp_dss *mdss = (struct mp_dss *)ptr; ++ __be32 *start = ptr; ++ ++ mdss->kind = TCPOPT_MPTCP; ++ mdss->sub = MPTCP_SUB_DSS; ++ mdss->rsv1 = 0; ++ mdss->rsv2 = 0; ++ mdss->F = mptcp_is_data_fin(skb) ? 1 : 0; ++ mdss->m = 0; ++ mdss->M = mptcp_is_data_seq(skb) ? 1 : 0; ++ mdss->a = 0; ++ mdss->A = 1; ++ mdss->len = mptcp_sub_len_dss(mdss, tp->mpcb->dss_csum); ++ ptr++; ++ ++ *ptr++ = htonl(mptcp_meta_tp(tp)->rcv_nxt); ++ ++ return ptr - start; ++} ++ ++/* RFC6824 states that once a particular subflow mapping has been sent ++ * out it must never be changed. However, packets may be split while ++ * they are in the retransmission queue (due to SACK or ACKs) and that ++ * arguably means that we would change the mapping (e.g. it splits it, ++ * our sends out a subset of the initial mapping). ++ * ++ * Furthermore, the skb checksum is not always preserved across splits ++ * (e.g. mptcp_fragment) which would mean that we need to recompute ++ * the DSS checksum in this case. ++ * ++ * To avoid this we save the initial DSS mapping which allows us to ++ * send the same DSS mapping even for fragmented retransmits. 
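++ *
++ * The saved layout is exactly what the two helpers above emit: the DSS
++ * option header plus the data-ACK (mptcp_write_dss_data_ack), followed by
++ * data_seq, subflow_seq and data_len (mptcp_write_dss_mapping), with the
++ * DSS checksum taking the place of the two trailing padding bytes when
++ * dss_csum is enabled.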
++ */ ++static void mptcp_save_dss_data_seq(const struct tcp_sock *tp, struct sk_buff *skb) ++{ ++ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); ++ __be32 *ptr = (__be32 *)tcb->dss; ++ ++ tcb->mptcp_flags |= MPTCPHDR_SEQ; ++ ++ ptr += mptcp_write_dss_data_ack(tp, skb, ptr); ++ ptr += mptcp_write_dss_mapping(tp, skb, ptr); ++} ++ ++/* Write the MP_CAPABLE with data-option */ ++static int mptcp_write_mpcapable_data(const struct tcp_sock *tp, ++ struct sk_buff *skb, ++ __be32 *ptr) ++{ ++ struct mp_capable *mpc = (struct mp_capable *)ptr; ++ u8 length; ++ ++ if (tp->mpcb->dss_csum) ++ length = MPTCPV1_SUB_LEN_CAPABLE_DATA_CSUM; ++ else ++ length = MPTCPV1_SUB_LEN_CAPABLE_DATA; ++ ++ mpc->kind = TCPOPT_MPTCP; ++ mpc->len = length; ++ mpc->sub = MPTCP_SUB_CAPABLE; ++ mpc->ver = MPTCP_VERSION_1; ++ mpc->a = tp->mpcb->dss_csum; ++ mpc->b = 0; ++ mpc->rsv = 0; ++ mpc->h = 1; ++ ++ ptr++; ++ memcpy(ptr, TCP_SKB_CB(skb)->dss, mptcp_dss_len); ++ ++ mpc->sender_key = tp->mpcb->mptcp_loc_key; ++ mpc->receiver_key = tp->mpcb->mptcp_rem_key; ++ ++ /* dss is in a union with inet_skb_parm and ++ * the IP layer expects zeroed IPCB fields. ++ */ ++ memset(TCP_SKB_CB(skb)->dss, 0, mptcp_dss_len); ++ ++ return MPTCPV1_SUB_LEN_CAPABLE_DATA_ALIGN / sizeof(*ptr); ++} ++ ++/* Write the saved DSS mapping to the header */ ++static int mptcp_write_dss_data_seq(const struct tcp_sock *tp, struct sk_buff *skb, ++ __be32 *ptr) ++{ ++ int length; ++ __be32 *start = ptr; ++ ++ if (tp->mpcb->rem_key_set) { ++ memcpy(ptr, TCP_SKB_CB(skb)->dss, mptcp_dss_len); ++ ++ /* update the data_ack */ ++ start[1] = htonl(mptcp_meta_tp(tp)->rcv_nxt); ++ ++ length = mptcp_dss_len / sizeof(*ptr); ++ } else { ++ memcpy(ptr, TCP_SKB_CB(skb)->dss, MPTCP_SUB_LEN_DSS_ALIGN); ++ ++ ptr++; ++ memcpy(ptr, TCP_SKB_CB(skb)->dss + 2, MPTCP_SUB_LEN_SEQ_ALIGN); ++ ++ length = (MPTCP_SUB_LEN_DSS_ALIGN + MPTCP_SUB_LEN_SEQ_ALIGN) / sizeof(*ptr); ++ } ++ ++ /* dss is in a union with inet_skb_parm and ++ * the IP layer expects zeroed IPCB fields. ++ */ ++ memset(TCP_SKB_CB(skb)->dss, 0 , mptcp_dss_len); ++ ++ return length; ++} ++ ++static bool mptcp_skb_entail(struct sock *sk, struct sk_buff *skb, int reinject) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ const struct sock *meta_sk = mptcp_meta_sk(sk); ++ struct mptcp_cb *mpcb = tp->mpcb; ++ struct tcp_skb_cb *tcb; ++ struct sk_buff *subskb = NULL; ++ ++ if (!reinject) ++ TCP_SKB_CB(skb)->mptcp_flags |= (mpcb->snd_hiseq_index ? ++ MPTCPHDR_SEQ64_INDEX : 0); ++ ++ tcp_skb_tsorted_save(skb) { ++ subskb = pskb_copy_for_clone(skb, GFP_ATOMIC); ++ } tcp_skb_tsorted_restore(skb); ++ if (!subskb) ++ return false; ++ ++ /* At the subflow-level we need to call again tcp_init_tso_segs. We ++ * force this, by setting pcount to 0. It has been set to 1 prior to ++ * the call to mptcp_skb_entail. 
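++	 * A pcount of 0 makes tcp_init_tso_segs() recompute the segment count
++	 * with the subflow's own MSS when the copy is actually transmitted.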
++ */ ++ tcp_skb_pcount_set(subskb, 0); ++ ++ TCP_SKB_CB(skb)->path_mask |= mptcp_pi_to_flag(tp->mptcp->path_index); ++ ++ /* Compute checksum */ ++ if (tp->mpcb->dss_csum) ++ subskb->csum = skb->csum = skb_checksum(skb, 0, skb->len, 0); ++ ++ tcb = TCP_SKB_CB(subskb); ++ ++ if (tp->mpcb->send_infinite_mapping && ++ !tp->mpcb->infinite_mapping_snd && ++ !before(tcb->seq, mptcp_meta_tp(tp)->snd_nxt)) { ++ tp->mptcp->fully_established = 1; ++ tp->mpcb->infinite_mapping_snd = 1; ++ tp->mptcp->infinite_cutoff_seq = tp->write_seq; ++ tcb->mptcp_flags |= MPTCPHDR_INF; ++ } ++ ++ if (mptcp_is_data_fin(subskb)) ++ mptcp_combine_dfin(subskb, meta_sk, sk); ++ ++ mptcp_save_dss_data_seq(tp, subskb); ++ ++ if (mpcb->send_mptcpv1_mpcapable) { ++ TCP_SKB_CB(subskb)->mptcp_flags |= MPTCPHDR_MPC_DATA; ++ mpcb->send_mptcpv1_mpcapable = 0; ++ } ++ ++ tcb->seq = tp->write_seq; ++ ++ /* Take into account seg len */ ++ tp->write_seq += subskb->len + ((tcb->tcp_flags & TCPHDR_FIN) ? 1 : 0); ++ tcb->end_seq = tp->write_seq; ++ ++ /* txstamp_ack is handled at the meta-level */ ++ tcb->txstamp_ack = 0; ++ ++ /* If it's a non-payload DATA_FIN (also no subflow-fin), the ++ * segment is not part of the subflow but on a meta-only-level. ++ */ ++ if (!mptcp_is_data_fin(subskb) || tcb->end_seq != tcb->seq) { ++ /* Make sure that this list is clean */ ++ INIT_LIST_HEAD(&subskb->tcp_tsorted_anchor); ++ ++ tcp_add_write_queue_tail(sk, subskb); ++ sk->sk_wmem_queued += subskb->truesize; ++ sk_mem_charge(sk, subskb->truesize); ++ } else { ++ /* Necessary to initialize for tcp_transmit_skb. mss of 1, as ++ * skb->len = 0 will force tso_segs to 1. ++ */ ++ tcp_init_tso_segs(subskb, 1); ++ ++ /* Empty data-fins are sent immediatly on the subflow */ ++ if (tcp_transmit_skb(sk, subskb, 0, GFP_ATOMIC)) ++ return false; ++ } ++ ++ if (!tp->mptcp->fully_established) { ++ tp->mptcp->second_packet = 1; ++ tp->mptcp->last_end_data_seq = TCP_SKB_CB(skb)->end_seq; ++ } ++ ++ return true; ++} ++ ++/* Fragment an skb and update the mptcp meta-data. Due to reinject, we ++ * might need to undo some operations done by tcp_fragment. ++ * ++ * Be careful, the skb may come from 3 different places: ++ * - The send-queue (tcp_queue == TCP_FRAG_IN_WRITE_QUEUE) ++ * - The retransmit-queue (tcp_queue == TCP_FRAG_IN_RTX_QUEUE) ++ * - The reinject-queue (reinject == -1) ++ */ ++static int mptcp_fragment(struct sock *meta_sk, enum tcp_queue tcp_queue, ++ struct sk_buff *skb, u32 len, ++ gfp_t gfp, int reinject) ++{ ++ int ret, diff, old_factor; ++ struct sk_buff *buff; ++ u8 flags; ++ ++ if (skb_headlen(skb) < len) ++ diff = skb->len - len; ++ else ++ diff = skb->data_len; ++ old_factor = tcp_skb_pcount(skb); ++ ++ /* The mss_now in tcp_fragment is used to set the tso_segs of the skb. ++ * At the MPTCP-level we do not care about the absolute value. All we ++ * care about is that it is set to 1 for accurate packets_out ++ * accounting. ++ */ ++ ret = tcp_fragment(meta_sk, tcp_queue, skb, len, UINT_MAX, gfp); ++ if (ret) ++ return ret; ++ ++ if (tcp_queue == TCP_FRAG_IN_WRITE_QUEUE) ++ buff = skb->next; ++ else ++ buff = skb_rb_next(skb); ++ ++ flags = TCP_SKB_CB(skb)->mptcp_flags; ++ TCP_SKB_CB(skb)->mptcp_flags = flags & ~(MPTCPHDR_FIN); ++ TCP_SKB_CB(buff)->mptcp_flags = flags; ++ TCP_SKB_CB(buff)->path_mask = TCP_SKB_CB(skb)->path_mask; ++ ++ /* If reinject == 1, the buff will be added to the reinject ++ * queue, which is currently not part of memory accounting. So ++ * undo the changes done by tcp_fragment and update the ++ * reinject queue. 
Also, undo changes to the packet counters. ++ */ ++ if (reinject == 1) { ++ int undo = buff->truesize - diff; ++ meta_sk->sk_wmem_queued -= undo; ++ sk_mem_uncharge(meta_sk, undo); ++ ++ tcp_sk(meta_sk)->mpcb->reinject_queue.qlen++; ++ if (tcp_queue == TCP_FRAG_IN_WRITE_QUEUE) ++ meta_sk->sk_write_queue.qlen--; ++ ++ if (!before(tcp_sk(meta_sk)->snd_nxt, TCP_SKB_CB(buff)->end_seq)) { ++ undo = old_factor - tcp_skb_pcount(skb) - ++ tcp_skb_pcount(buff); ++ if (undo) ++ tcp_adjust_pcount(meta_sk, skb, -undo); ++ } ++ ++ /* tcp_fragment's call to sk_stream_alloc_skb initializes the ++ * tcp_tsorted_anchor. We need to revert this as it clashes ++ * with the refdst pointer. ++ */ ++ tcp_skb_tsorted_anchor_cleanup(buff); ++ } ++ ++ return 0; ++} ++ ++/* Inspired by tcp_write_wakeup */ ++int mptcp_write_wakeup(struct sock *meta_sk, int mib) ++{ ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk); ++ struct sk_buff *skb; ++ int ans = 0; ++ ++ if (meta_sk->sk_state == TCP_CLOSE) ++ return -1; ++ ++ skb = tcp_send_head(meta_sk); ++ if (skb && ++ before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(meta_tp))) { ++ unsigned int mss; ++ unsigned int seg_size = tcp_wnd_end(meta_tp) - TCP_SKB_CB(skb)->seq; ++ struct sock *subsk = meta_tp->mpcb->sched_ops->get_subflow(meta_sk, skb, true); ++ struct tcp_sock *subtp; ++ ++ WARN_ON(TCP_SKB_CB(skb)->sacked); ++ ++ if (!subsk) ++ goto window_probe; ++ subtp = tcp_sk(subsk); ++ mss = tcp_current_mss(subsk); ++ ++ seg_size = min(tcp_wnd_end(meta_tp) - TCP_SKB_CB(skb)->seq, ++ tcp_wnd_end(subtp) - subtp->write_seq); ++ ++ if (before(meta_tp->pushed_seq, TCP_SKB_CB(skb)->end_seq)) ++ meta_tp->pushed_seq = TCP_SKB_CB(skb)->end_seq; ++ ++ /* We are probing the opening of a window ++ * but the window size is != 0 ++ * must have been a result SWS avoidance ( sender ) ++ */ ++ if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq || ++ skb->len > mss) { ++ seg_size = min(seg_size, mss); ++ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; ++ if (mptcp_fragment(meta_sk, TCP_FRAG_IN_WRITE_QUEUE, ++ skb, seg_size, GFP_ATOMIC, 0)) ++ return -1; ++ } else if (!tcp_skb_pcount(skb)) { ++ /* see mptcp_write_xmit on why we use UINT_MAX */ ++ tcp_set_skb_tso_segs(skb, UINT_MAX); ++ } ++ ++ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; ++ if (!mptcp_skb_entail(subsk, skb, 0)) ++ return -1; ++ ++ mptcp_check_sndseq_wrap(meta_tp, TCP_SKB_CB(skb)->end_seq - ++ TCP_SKB_CB(skb)->seq); ++ tcp_event_new_data_sent(meta_sk, skb); ++ ++ __tcp_push_pending_frames(subsk, mss, TCP_NAGLE_PUSH); ++ tcp_update_skb_after_send(meta_sk, skb, meta_tp->tcp_wstamp_ns); ++ meta_tp->lsndtime = tcp_jiffies32; ++ ++ return 0; ++ } else { ++ struct mptcp_tcp_sock *mptcp; ++ ++window_probe: ++ if (between(meta_tp->snd_up, meta_tp->snd_una + 1, ++ meta_tp->snd_una + 0xFFFF)) { ++ mptcp_for_each_sub(meta_tp->mpcb, mptcp) { ++ struct sock *sk_it = mptcp_to_sock(mptcp); ++ ++ if (mptcp_sk_can_send_ack(sk_it)) ++ tcp_xmit_probe_skb(sk_it, 1, mib); ++ } ++ } ++ ++ /* At least one of the tcp_xmit_probe_skb's has to succeed */ ++ mptcp_for_each_sub(meta_tp->mpcb, mptcp) { ++ struct sock *sk_it = mptcp_to_sock(mptcp); ++ int ret; ++ ++ if (!mptcp_sk_can_send_ack(sk_it)) ++ continue; ++ ++ ret = tcp_xmit_probe_skb(sk_it, 0, mib); ++ if (unlikely(ret > 0)) ++ ans = ret; ++ } ++ return ans; ++ } ++} ++ ++bool mptcp_write_xmit(struct sock *meta_sk, unsigned int mss_now, int nonagle, ++ int push_one, gfp_t gfp) ++{ ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk), *subtp; ++ bool is_rwnd_limited = false; ++ struct mptcp_tcp_sock *mptcp; ++ 
struct sock *subsk = NULL; ++ struct mptcp_cb *mpcb = meta_tp->mpcb; ++ struct sk_buff *skb; ++ int reinject = 0; ++ unsigned int sublimit; ++ __u32 path_mask = 0; ++ ++ tcp_mstamp_refresh(meta_tp); ++ ++ if (inet_csk(meta_sk)->icsk_retransmits) { ++ /* If the timer already once fired, retransmit the head of the ++ * queue to unblock us ASAP. ++ */ ++ if (meta_tp->packets_out && !mpcb->infinite_mapping_snd) ++ mptcp_retransmit_skb(meta_sk, tcp_rtx_queue_head(meta_sk)); ++ } ++ ++ while ((skb = mpcb->sched_ops->next_segment(meta_sk, &reinject, &subsk, ++ &sublimit))) { ++ enum tcp_queue tcp_queue = TCP_FRAG_IN_WRITE_QUEUE; ++ unsigned int limit; ++ ++ WARN(TCP_SKB_CB(skb)->sacked, "sacked: %u reinject: %u", ++ TCP_SKB_CB(skb)->sacked, reinject); ++ ++ subtp = tcp_sk(subsk); ++ mss_now = tcp_current_mss(subsk); ++ ++ if (reinject == 1) { ++ if (!after(TCP_SKB_CB(skb)->end_seq, meta_tp->snd_una)) { ++ /* Segment already reached the peer, take the next one */ ++ __skb_unlink(skb, &mpcb->reinject_queue); ++ __kfree_skb(skb); ++ continue; ++ } ++ } else if (reinject == -1) { ++ tcp_queue = TCP_FRAG_IN_RTX_QUEUE; ++ } ++ ++ /* If the segment was cloned (e.g. a meta retransmission), ++ * the header must be expanded/copied so that there is no ++ * corruption of TSO information. ++ */ ++ if (skb_unclone(skb, GFP_ATOMIC)) ++ break; ++ ++ if (unlikely(!tcp_snd_wnd_test(meta_tp, skb, mss_now))) { ++ is_rwnd_limited = true; ++ break; ++ } ++ ++ /* Force tso_segs to 1 by using UINT_MAX. ++ * We actually don't care about the exact number of segments ++ * emitted on the subflow. We need just to set tso_segs, because ++ * we still need an accurate packets_out count in ++ * tcp_event_new_data_sent. ++ */ ++ tcp_set_skb_tso_segs(skb, UINT_MAX); ++ ++ /* Check for nagle, irregardless of tso_segs. If the segment is ++ * actually larger than mss_now (TSO segment), then ++ * tcp_nagle_check will have partial == false and always trigger ++ * the transmission. ++ * tcp_write_xmit has a TSO-level nagle check which is not ++ * subject to the MPTCP-level. It is based on the properties of ++ * the subflow, not the MPTCP-level. ++ * When the segment is a reinjection or redundant scheduled ++ * segment, nagle check at meta-level may prevent ++ * sending. This could hurt with certain schedulers, as they ++ * to reinjection to recover from a window-stall or reduce latency. ++ * Therefore, Nagle check should be disabled in that case. ++ */ ++ if (!reinject && ++ unlikely(!tcp_nagle_test(meta_tp, skb, mss_now, ++ (tcp_skb_is_last(meta_sk, skb) ? ++ nonagle : TCP_NAGLE_PUSH)))) ++ break; ++ ++ limit = mss_now; ++ /* skb->len > mss_now is the equivalent of tso_segs > 1 in ++ * tcp_write_xmit. Otherwise split-point would return 0. ++ */ ++ if (skb->len > mss_now && !tcp_urg_mode(meta_tp)) ++ /* We limit the size of the skb so that it fits into the ++ * window. Call tcp_mss_split_point to avoid duplicating ++ * code. ++ * We really only care about fitting the skb into the ++ * window. That's why we use UINT_MAX. If the skb does ++ * not fit into the cwnd_quota or the NIC's max-segs ++ * limitation, it will be split by the subflow's ++ * tcp_write_xmit which does the appropriate call to ++ * tcp_mss_split_point. 
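++			 * UINT_MAX / mss_now (rather than UINT_MAX itself) is
++			 * passed as max_segs so that the mss_now * max_segs
++			 * product computed inside tcp_mss_split_point cannot
++			 * overflow; the bound stays close to UINT_MAX and only
++			 * the send window effectively limits the split point.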
++ */ ++ limit = tcp_mss_split_point(meta_sk, skb, mss_now, ++ UINT_MAX / mss_now, ++ nonagle); ++ ++ if (sublimit) ++ limit = min(limit, sublimit); ++ ++ if (skb->len > limit && ++ unlikely(mptcp_fragment(meta_sk, tcp_queue, ++ skb, limit, gfp, reinject))) ++ break; ++ ++ if (!mptcp_skb_entail(subsk, skb, reinject)) ++ break; ++ ++ if (reinject <= 0) ++ tcp_update_skb_after_send(meta_sk, skb, meta_tp->tcp_wstamp_ns); ++ meta_tp->lsndtime = tcp_jiffies32; ++ ++ path_mask |= mptcp_pi_to_flag(subtp->mptcp->path_index); ++ ++ if (!reinject) { ++ mptcp_check_sndseq_wrap(meta_tp, ++ TCP_SKB_CB(skb)->end_seq - ++ TCP_SKB_CB(skb)->seq); ++ tcp_event_new_data_sent(meta_sk, skb); ++ } ++ ++ tcp_minshall_update(meta_tp, mss_now, skb); ++ ++ if (reinject > 0) { ++ __skb_unlink(skb, &mpcb->reinject_queue); ++ kfree_skb(skb); ++ } ++ ++ if (push_one) ++ break; ++ } ++ ++ if (is_rwnd_limited) ++ tcp_chrono_start(meta_sk, TCP_CHRONO_RWND_LIMITED); ++ else ++ tcp_chrono_stop(meta_sk, TCP_CHRONO_RWND_LIMITED); ++ ++ mptcp_for_each_sub(mpcb, mptcp) { ++ subsk = mptcp_to_sock(mptcp); ++ subtp = tcp_sk(subsk); ++ ++ if (!(path_mask & mptcp_pi_to_flag(subtp->mptcp->path_index))) ++ continue; ++ ++ mss_now = tcp_current_mss(subsk); ++ ++ /* Nagle is handled at the MPTCP-layer, so ++ * always push on the subflow ++ */ ++ __tcp_push_pending_frames(subsk, mss_now, TCP_NAGLE_PUSH); ++ } ++ ++ return !meta_tp->packets_out && tcp_send_head(meta_sk); ++} ++ ++void mptcp_write_space(struct sock *sk) ++{ ++ mptcp_push_pending_frames(mptcp_meta_sk(sk)); ++} ++ ++u32 __mptcp_select_window(struct sock *sk) ++{ ++ struct inet_connection_sock *icsk = inet_csk(sk); ++ struct tcp_sock *tp = tcp_sk(sk), *meta_tp = mptcp_meta_tp(tp); ++ struct sock *meta_sk = mptcp_meta_sk(sk); ++ int mss, free_space, full_space, window; ++ ++ /* MSS for the peer's data. Previous versions used mss_clamp ++ * here. I don't know if the value based on our guesses ++ * of peer's MSS is better for the performance. It's more correct ++ * but may be worse for the performance because of rcv_mss ++ * fluctuations. --SAW 1998/11/1 ++ */ ++ mss = icsk->icsk_ack.rcv_mss; ++ free_space = tcp_space(meta_sk); ++ full_space = min_t(int, meta_tp->window_clamp, ++ tcp_full_space(meta_sk)); ++ ++ if (mss > full_space) ++ mss = full_space; ++ ++ if (free_space < (full_space >> 1)) { ++ /* If free_space is decreasing due to mostly meta-level ++ * out-of-order packets, don't turn off the quick-ack mode. ++ */ ++ if (meta_tp->rcv_nxt - meta_tp->copied_seq > ((full_space - free_space) >> 1)) ++ icsk->icsk_ack.quick = 0; ++ ++ if (tcp_memory_pressure) ++ /* TODO this has to be adapted when we support different ++ * MSS's among the subflows. ++ */ ++ meta_tp->rcv_ssthresh = min(meta_tp->rcv_ssthresh, ++ 4U * meta_tp->advmss); ++ ++ if (free_space < mss) ++ return 0; ++ } ++ ++ if (free_space > meta_tp->rcv_ssthresh) ++ free_space = meta_tp->rcv_ssthresh; ++ ++ /* Don't do rounding if we are using window scaling, since the ++ * scaled window will not line up with the MSS boundary anyway. ++ */ ++ window = meta_tp->rcv_wnd; ++ if (tp->rx_opt.rcv_wscale) { ++ window = free_space; ++ ++ /* Advertise enough space so that it won't get scaled away. ++ * Import case: prevent zero window announcement if ++ * 1< mss. ++ */ ++ if (((window >> tp->rx_opt.rcv_wscale) << tp-> ++ rx_opt.rcv_wscale) != window) ++ window = (((window >> tp->rx_opt.rcv_wscale) + 1) ++ << tp->rx_opt.rcv_wscale); ++ } else { ++ /* Get the largest window that is a nice multiple of mss. 
++ * Window clamp already applied above. ++ * If our current window offering is within 1 mss of the ++ * free space we just keep it. This prevents the divide ++ * and multiply from happening most of the time. ++ * We also don't do any window rounding when the free space ++ * is too small. ++ */ ++ if (window <= free_space - mss || window > free_space) ++ window = (free_space / mss) * mss; ++ else if (mss == full_space && ++ free_space > window + (full_space >> 1)) ++ window = free_space; ++ } ++ ++ return window; ++} ++ ++void mptcp_syn_options(const struct sock *sk, struct tcp_out_options *opts, ++ unsigned *remaining) ++{ ++ const struct tcp_sock *tp = tcp_sk(sk); ++ ++ opts->options |= OPTION_MPTCP; ++ if (is_master_tp(tp)) { ++ opts->mptcp_options |= OPTION_MP_CAPABLE | OPTION_TYPE_SYN; ++ opts->mptcp_ver = tp->mptcp_ver; ++ ++ if (tp->mptcp_ver >= MPTCP_VERSION_1) ++ *remaining -= MPTCPV1_SUB_LEN_CAPABLE_SYN_ALIGN; ++ else ++ *remaining -= MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN; ++ ++ opts->mp_capable.sender_key = tp->mptcp_loc_key; ++ opts->dss_csum = !!sysctl_mptcp_checksum; ++ } else { ++ const struct mptcp_cb *mpcb = tp->mpcb; ++ ++ opts->mptcp_options |= OPTION_MP_JOIN | OPTION_TYPE_SYN; ++ *remaining -= MPTCP_SUB_LEN_JOIN_SYN_ALIGN; ++ opts->mp_join_syns.token = mpcb->mptcp_rem_token; ++ opts->mp_join_syns.low_prio = tp->mptcp->low_prio; ++ opts->addr_id = tp->mptcp->loc_id; ++ opts->mp_join_syns.sender_nonce = tp->mptcp->mptcp_loc_nonce; ++ } ++} ++ ++void mptcp_synack_options(struct request_sock *req, ++ struct tcp_out_options *opts, unsigned *remaining) ++{ ++ struct mptcp_request_sock *mtreq; ++ mtreq = mptcp_rsk(req); ++ ++ opts->options |= OPTION_MPTCP; ++ /* MPCB not yet set - thus it's a new MPTCP-session */ ++ if (!mtreq->is_sub) { ++ opts->mptcp_options |= OPTION_MP_CAPABLE | OPTION_TYPE_SYNACK; ++ opts->mptcp_ver = mtreq->mptcp_ver; ++ opts->mp_capable.sender_key = mtreq->mptcp_loc_key; ++ opts->dss_csum = !!sysctl_mptcp_checksum || mtreq->dss_csum; ++ if (mtreq->mptcp_ver >= MPTCP_VERSION_1) { ++ *remaining -= MPTCPV1_SUB_LEN_CAPABLE_SYNACK_ALIGN; ++ } else { ++ *remaining -= MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN; ++ } ++ } else { ++ opts->mptcp_options |= OPTION_MP_JOIN | OPTION_TYPE_SYNACK; ++ opts->mp_join_syns.sender_truncated_mac = ++ mtreq->mptcp_hash_tmac; ++ opts->mp_join_syns.sender_nonce = mtreq->mptcp_loc_nonce; ++ opts->mp_join_syns.low_prio = mtreq->low_prio; ++ opts->addr_id = mtreq->loc_id; ++ *remaining -= MPTCP_SUB_LEN_JOIN_SYNACK_ALIGN; ++ } ++} ++ ++void mptcp_established_options(struct sock *sk, struct sk_buff *skb, ++ struct tcp_out_options *opts, unsigned *size) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct mptcp_cb *mpcb = tp->mpcb; ++ const struct tcp_skb_cb *tcb = skb ? TCP_SKB_CB(skb) : NULL; ++ ++ /* We are coming from tcp_current_mss with the meta_sk as an argument. ++ * It does not make sense to check for the options, because when the ++ * segment gets sent, another subflow will be chosen. ++ */ ++ if (!skb && is_meta_sk(sk)) ++ return; ++ ++ if (unlikely(tp->send_mp_fclose)) { ++ opts->options |= OPTION_MPTCP; ++ opts->mptcp_options |= OPTION_MP_FCLOSE; ++ opts->mp_capable.receiver_key = mpcb->mptcp_rem_key; ++ *size += MPTCP_SUB_LEN_FCLOSE_ALIGN; ++ return; ++ } ++ ++ /* 1. If we are the sender of the infinite-mapping, we need the ++ * MPTCPHDR_INF-flag, because a retransmission of the ++ * infinite-announcment still needs the mptcp-option. 
++ * ++ * We need infinite_cutoff_seq, because retransmissions from before ++ * the infinite-cutoff-moment still need the MPTCP-signalling to stay ++ * consistent. ++ * ++ * 2. If we are the receiver of the infinite-mapping, we always skip ++ * mptcp-options, because acknowledgments from before the ++ * infinite-mapping point have already been sent out. ++ * ++ * I know, the whole infinite-mapping stuff is ugly... ++ * ++ * TODO: Handle wrapped data-sequence numbers ++ * (even if it's very unlikely) ++ */ ++ if (unlikely(mpcb->infinite_mapping_snd) && ++ ((mpcb->send_infinite_mapping && tcb && ++ mptcp_is_data_seq(skb) && ++ !(tcb->mptcp_flags & MPTCPHDR_INF) && ++ !before(tcb->seq, tp->mptcp->infinite_cutoff_seq)) || ++ !mpcb->send_infinite_mapping)) ++ return; ++ ++ if (unlikely(tp->mptcp->include_mpc)) { ++ opts->options |= OPTION_MPTCP; ++ opts->mptcp_options |= OPTION_MP_CAPABLE | ++ OPTION_TYPE_ACK; ++ ++ if (mpcb->mptcp_ver >= MPTCP_VERSION_1) ++ *size += MPTCPV1_SUB_LEN_CAPABLE_ACK_ALIGN; ++ else ++ *size += MPTCP_SUB_LEN_CAPABLE_ACK_ALIGN; ++ ++ opts->mptcp_ver = mpcb->mptcp_ver; ++ opts->mp_capable.sender_key = mpcb->mptcp_loc_key; ++ opts->mp_capable.receiver_key = mpcb->mptcp_rem_key; ++ opts->dss_csum = mpcb->dss_csum; ++ ++ if (skb) ++ tp->mptcp->include_mpc = 0; ++ } ++ if (unlikely(tp->mptcp->pre_established) && ++ (!skb || !(tcb->tcp_flags & (TCPHDR_FIN | TCPHDR_RST)))) { ++ opts->options |= OPTION_MPTCP; ++ opts->mptcp_options |= OPTION_MP_JOIN | OPTION_TYPE_ACK; ++ *size += MPTCP_SUB_LEN_JOIN_ACK_ALIGN; ++ } ++ ++ if (unlikely(mpcb->addr_signal) && mpcb->pm_ops->addr_signal && ++ mpcb->mptcp_ver >= MPTCP_VERSION_1 && skb && !mptcp_is_data_seq(skb)) { ++ mpcb->pm_ops->addr_signal(sk, size, opts, skb); ++ ++ if (opts->add_addr_v6) ++ /* Skip subsequent options */ ++ return; ++ } ++ ++ if (!tp->mptcp->include_mpc && !tp->mptcp->pre_established) { ++ opts->options |= OPTION_MPTCP; ++ opts->mptcp_options |= OPTION_DATA_ACK; ++ /* If !skb, we come from tcp_current_mss and thus we always ++ * assume that the DSS-option will be set for the data-packet. ++ */ ++ if (skb && !mptcp_is_data_seq(skb) && mpcb->rem_key_set) { ++ *size += MPTCP_SUB_LEN_ACK_ALIGN; ++ } else if ((skb && mptcp_is_data_mpcapable(skb)) || ++ (!skb && tp->mpcb->send_mptcpv1_mpcapable)) { ++ *size += MPTCPV1_SUB_LEN_CAPABLE_DATA_ALIGN; ++ } else { ++ /* Doesn't matter, if csum included or not. 
It will be ++ * either 10 or 12, and thus aligned = 12 ++ */ ++ if (mpcb->rem_key_set) ++ *size += MPTCP_SUB_LEN_ACK_ALIGN + ++ MPTCP_SUB_LEN_SEQ_ALIGN; ++ else ++ *size += MPTCP_SUB_LEN_SEQ_ALIGN; ++ } ++ ++ *size += MPTCP_SUB_LEN_DSS_ALIGN; ++ } ++ ++ /* In fallback mp_fail-mode, we have to repeat it until the fallback ++ * has been done by the sender ++ */ ++ if (unlikely(tp->mptcp->send_mp_fail) && skb && ++ MAX_TCP_OPTION_SPACE - *size >= MPTCP_SUB_LEN_FAIL) { ++ opts->options |= OPTION_MPTCP; ++ opts->mptcp_options |= OPTION_MP_FAIL; ++ *size += MPTCP_SUB_LEN_FAIL; ++ } ++ ++ if (unlikely(mpcb->addr_signal) && mpcb->pm_ops->addr_signal && ++ mpcb->mptcp_ver < MPTCP_VERSION_1) ++ mpcb->pm_ops->addr_signal(sk, size, opts, skb); ++ ++ if (unlikely(tp->mptcp->send_mp_prio) && ++ MAX_TCP_OPTION_SPACE - *size >= MPTCP_SUB_LEN_PRIO_ALIGN) { ++ opts->options |= OPTION_MPTCP; ++ opts->mptcp_options |= OPTION_MP_PRIO; ++ if (skb) ++ tp->mptcp->send_mp_prio = 0; ++ *size += MPTCP_SUB_LEN_PRIO_ALIGN; ++ } ++ ++ return; ++} ++ ++u16 mptcp_select_window(struct sock *sk) ++{ ++ u16 new_win = tcp_select_window(sk); ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct tcp_sock *meta_tp = mptcp_meta_tp(tp); ++ ++ meta_tp->rcv_wnd = tp->rcv_wnd; ++ meta_tp->rcv_wup = meta_tp->rcv_nxt; ++ /* no need to use tcp_update_rcv_right_edge, because at the meta level ++ * right edge cannot go back ++ */ ++ meta_tp->rcv_right_edge = meta_tp->rcv_wnd + meta_tp->rcv_wup; ++ ++ return new_win; ++} ++ ++void mptcp_options_write(__be32 *ptr, struct tcp_sock *tp, ++ const struct tcp_out_options *opts, ++ struct sk_buff *skb) ++{ ++ if (unlikely(OPTION_MP_CAPABLE & opts->mptcp_options)) { ++ struct mp_capable *mpc = (struct mp_capable *)ptr; ++ ++ mpc->kind = TCPOPT_MPTCP; ++ ++ if (OPTION_TYPE_SYN & opts->mptcp_options) { ++ mpc->ver = opts->mptcp_ver; ++ ++ if (mpc->ver >= MPTCP_VERSION_1) { ++ mpc->len = MPTCPV1_SUB_LEN_CAPABLE_SYN; ++ ptr += MPTCPV1_SUB_LEN_CAPABLE_SYN_ALIGN >> 2; ++ } else { ++ mpc->sender_key = opts->mp_capable.sender_key; ++ mpc->len = MPTCP_SUB_LEN_CAPABLE_SYN; ++ ptr += MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN >> 2; ++ } ++ } else if (OPTION_TYPE_SYNACK & opts->mptcp_options) { ++ mpc->ver = opts->mptcp_ver; ++ ++ if (mpc->ver >= MPTCP_VERSION_1) { ++ mpc->len = MPTCPV1_SUB_LEN_CAPABLE_SYNACK; ++ ptr += MPTCPV1_SUB_LEN_CAPABLE_SYNACK_ALIGN >> 2; ++ } else { ++ mpc->len = MPTCP_SUB_LEN_CAPABLE_SYN; ++ ptr += MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN >> 2; ++ } ++ ++ mpc->sender_key = opts->mp_capable.sender_key; ++ } else if (OPTION_TYPE_ACK & opts->mptcp_options) { ++ mpc->len = MPTCP_SUB_LEN_CAPABLE_ACK; ++ mpc->ver = opts->mptcp_ver; ++ ptr += MPTCP_SUB_LEN_CAPABLE_ACK_ALIGN >> 2; ++ ++ mpc->sender_key = opts->mp_capable.sender_key; ++ mpc->receiver_key = opts->mp_capable.receiver_key; ++ } ++ ++ mpc->sub = MPTCP_SUB_CAPABLE; ++ mpc->a = opts->dss_csum; ++ mpc->b = 0; ++ mpc->rsv = 0; ++ mpc->h = 1; ++ } ++ if (unlikely(OPTION_MP_JOIN & opts->mptcp_options)) { ++ struct mp_join *mpj = (struct mp_join *)ptr; ++ ++ mpj->kind = TCPOPT_MPTCP; ++ mpj->sub = MPTCP_SUB_JOIN; ++ mpj->rsv = 0; ++ ++ if (OPTION_TYPE_SYN & opts->mptcp_options) { ++ mpj->len = MPTCP_SUB_LEN_JOIN_SYN; ++ mpj->u.syn.token = opts->mp_join_syns.token; ++ mpj->u.syn.nonce = opts->mp_join_syns.sender_nonce; ++ mpj->b = opts->mp_join_syns.low_prio; ++ mpj->addr_id = opts->addr_id; ++ ptr += MPTCP_SUB_LEN_JOIN_SYN_ALIGN >> 2; ++ } else if (OPTION_TYPE_SYNACK & opts->mptcp_options) { ++ mpj->len = MPTCP_SUB_LEN_JOIN_SYNACK; ++ mpj->u.synack.mac = ++ 
opts->mp_join_syns.sender_truncated_mac; ++ mpj->u.synack.nonce = opts->mp_join_syns.sender_nonce; ++ mpj->b = opts->mp_join_syns.low_prio; ++ mpj->addr_id = opts->addr_id; ++ ptr += MPTCP_SUB_LEN_JOIN_SYNACK_ALIGN >> 2; ++ } else if (OPTION_TYPE_ACK & opts->mptcp_options) { ++ mpj->len = MPTCP_SUB_LEN_JOIN_ACK; ++ mpj->addr_id = 0; /* addr_id is rsv (RFC 6824, p. 21) */ ++ memcpy(mpj->u.ack.mac, &tp->mptcp->sender_mac[0], 20); ++ ptr += MPTCP_SUB_LEN_JOIN_ACK_ALIGN >> 2; ++ } ++ } ++ if (unlikely(OPTION_ADD_ADDR & opts->mptcp_options)) { ++ struct mp_add_addr *mpadd = (struct mp_add_addr *)ptr; ++ struct mptcp_cb *mpcb = tp->mpcb; ++ ++ mpadd->kind = TCPOPT_MPTCP; ++ if (opts->add_addr_v4) { ++ mpadd->addr_id = opts->add_addr4.addr_id; ++ mpadd->u.v4.addr = opts->add_addr4.addr; ++ if (mpcb->mptcp_ver < MPTCP_VERSION_1) { ++ mpadd->u_bit.v0.sub = MPTCP_SUB_ADD_ADDR; ++ mpadd->u_bit.v0.ipver = 4; ++ mpadd->len = MPTCP_SUB_LEN_ADD_ADDR4; ++ ptr += MPTCP_SUB_LEN_ADD_ADDR4_ALIGN >> 2; ++ } else { ++ mpadd->u_bit.v1.sub = MPTCP_SUB_ADD_ADDR; ++ mpadd->u_bit.v1.rsv = 0; ++ mpadd->u_bit.v1.echo = 0; ++ memcpy((char *)mpadd->u.v4.mac - 2, ++ (char *)&opts->add_addr4.trunc_mac, 8); ++ mpadd->len = MPTCP_SUB_LEN_ADD_ADDR4_VER1; ++ ptr += MPTCP_SUB_LEN_ADD_ADDR4_ALIGN_VER1 >> 2; ++ } ++ } else if (opts->add_addr_v6) { ++ mpadd->addr_id = opts->add_addr6.addr_id; ++ memcpy(&mpadd->u.v6.addr, &opts->add_addr6.addr, ++ sizeof(mpadd->u.v6.addr)); ++ if (mpcb->mptcp_ver < MPTCP_VERSION_1) { ++ mpadd->u_bit.v0.sub = MPTCP_SUB_ADD_ADDR; ++ mpadd->u_bit.v0.ipver = 6; ++ mpadd->len = MPTCP_SUB_LEN_ADD_ADDR6; ++ ptr += MPTCP_SUB_LEN_ADD_ADDR6_ALIGN >> 2; ++ } else { ++ mpadd->u_bit.v1.sub = MPTCP_SUB_ADD_ADDR; ++ mpadd->u_bit.v1.rsv = 0; ++ mpadd->u_bit.v1.echo = 0; ++ memcpy((char *)mpadd->u.v6.mac - 2, ++ (char *)&opts->add_addr6.trunc_mac, 8); ++ mpadd->len = MPTCP_SUB_LEN_ADD_ADDR6_VER1; ++ ptr += MPTCP_SUB_LEN_ADD_ADDR6_ALIGN_VER1 >> 2; ++ } ++ } ++ ++ MPTCP_INC_STATS(sock_net((struct sock *)tp), MPTCP_MIB_ADDADDRTX); ++ } ++ if (unlikely(OPTION_REMOVE_ADDR & opts->mptcp_options)) { ++ struct mp_remove_addr *mprem = (struct mp_remove_addr *)ptr; ++ u8 *addrs_id; ++ int id, len, len_align; ++ ++ len = mptcp_sub_len_remove_addr(opts->remove_addrs); ++ len_align = mptcp_sub_len_remove_addr_align(opts->remove_addrs); ++ ++ mprem->kind = TCPOPT_MPTCP; ++ mprem->len = len; ++ mprem->sub = MPTCP_SUB_REMOVE_ADDR; ++ mprem->rsv = 0; ++ addrs_id = &mprem->addrs_id; ++ ++ mptcp_for_each_bit_set(opts->remove_addrs, id) ++ *(addrs_id++) = id; ++ ++ /* Fill the rest with NOP's */ ++ if (len_align > len) { ++ int i; ++ for (i = 0; i < len_align - len; i++) ++ *(addrs_id++) = TCPOPT_NOP; ++ } ++ ++ ptr += len_align >> 2; ++ ++ MPTCP_INC_STATS(sock_net((struct sock *)tp), MPTCP_MIB_REMADDRTX); ++ } ++ if (unlikely(OPTION_MP_FAIL & opts->mptcp_options)) { ++ struct mp_fail *mpfail = (struct mp_fail *)ptr; ++ ++ mpfail->kind = TCPOPT_MPTCP; ++ mpfail->len = MPTCP_SUB_LEN_FAIL; ++ mpfail->sub = MPTCP_SUB_FAIL; ++ mpfail->rsv1 = 0; ++ mpfail->rsv2 = 0; ++ mpfail->data_seq = htonll(tp->mpcb->csum_cutoff_seq); ++ ++ ptr += MPTCP_SUB_LEN_FAIL_ALIGN >> 2; ++ } ++ if (unlikely(OPTION_MP_FCLOSE & opts->mptcp_options)) { ++ struct mp_fclose *mpfclose = (struct mp_fclose *)ptr; ++ ++ mpfclose->kind = TCPOPT_MPTCP; ++ mpfclose->len = MPTCP_SUB_LEN_FCLOSE; ++ mpfclose->sub = MPTCP_SUB_FCLOSE; ++ mpfclose->rsv1 = 0; ++ mpfclose->rsv2 = 0; ++ mpfclose->key = opts->mp_capable.receiver_key; ++ ++ ptr += MPTCP_SUB_LEN_FCLOSE_ALIGN >> 2; ++ 
} ++ ++ if (OPTION_DATA_ACK & opts->mptcp_options) { ++ if (!mptcp_is_data_seq(skb) && tp->mpcb->rem_key_set) ++ ptr += mptcp_write_dss_data_ack(tp, skb, ptr); ++ else if (mptcp_is_data_mpcapable(skb)) ++ ptr += mptcp_write_mpcapable_data(tp, skb, ptr); ++ else ++ ptr += mptcp_write_dss_data_seq(tp, skb, ptr); ++ } ++ if (unlikely(OPTION_MP_PRIO & opts->mptcp_options)) { ++ struct mp_prio *mpprio = (struct mp_prio *)ptr; ++ ++ mpprio->kind = TCPOPT_MPTCP; ++ mpprio->len = MPTCP_SUB_LEN_PRIO; ++ mpprio->sub = MPTCP_SUB_PRIO; ++ mpprio->rsv = 0; ++ mpprio->b = tp->mptcp->low_prio; ++ mpprio->addr_id = TCPOPT_NOP; ++ ++ ptr += MPTCP_SUB_LEN_PRIO_ALIGN >> 2; ++ } ++} ++ ++/* Sends the datafin */ ++void mptcp_send_fin(struct sock *meta_sk) ++{ ++ struct sk_buff *skb, *tskb = tcp_write_queue_tail(meta_sk); ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk); ++ int mss_now; ++ ++ if ((1 << meta_sk->sk_state) & (TCPF_CLOSE_WAIT | TCPF_LAST_ACK)) ++ meta_tp->mpcb->passive_close = 1; ++ ++ /* Optimization, tack on the FIN if we have a queue of ++ * unsent frames. But be careful about outgoing SACKS ++ * and IP options. ++ */ ++ mss_now = mptcp_current_mss(meta_sk); ++ ++ if (tskb) { ++ TCP_SKB_CB(tskb)->mptcp_flags |= MPTCPHDR_FIN; ++ TCP_SKB_CB(tskb)->end_seq++; ++ meta_tp->write_seq++; ++ } else { ++ /* Socket is locked, keep trying until memory is available. */ ++ for (;;) { ++ skb = alloc_skb_fclone(MAX_TCP_HEADER, ++ meta_sk->sk_allocation); ++ if (skb) ++ break; ++ yield(); ++ } ++ /* Reserve space for headers and prepare control bits. */ ++ INIT_LIST_HEAD(&skb->tcp_tsorted_anchor); ++ skb_reserve(skb, MAX_TCP_HEADER); ++ ++ tcp_init_nondata_skb(skb, meta_tp->write_seq, TCPHDR_ACK); ++ TCP_SKB_CB(skb)->end_seq++; ++ TCP_SKB_CB(skb)->mptcp_flags |= MPTCPHDR_FIN; ++ tcp_queue_skb(meta_sk, skb); ++ } ++ __tcp_push_pending_frames(meta_sk, mss_now, TCP_NAGLE_OFF); ++} ++ ++void mptcp_send_active_reset(struct sock *meta_sk, gfp_t priority) ++{ ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk); ++ struct mptcp_cb *mpcb = meta_tp->mpcb; ++ struct sock *sk; ++ ++ if (hlist_empty(&mpcb->conn_list)) ++ return; ++ ++ WARN_ON(meta_tp->send_mp_fclose); ++ ++ /* First - select a socket */ ++ sk = mptcp_select_ack_sock(meta_sk); ++ ++ /* May happen if no subflow is in an appropriate state, OR ++ * we are in infinite mode or about to go there - just send a reset ++ */ ++ if (!sk || mptcp_in_infinite_mapping_weak(mpcb)) { ++ /* tcp_done must be handled with bh disabled */ ++ if (!in_serving_softirq()) ++ local_bh_disable(); ++ ++ mptcp_sub_force_close_all(mpcb, NULL); ++ ++ if (!in_serving_softirq()) ++ local_bh_enable(); ++ return; ++ } ++ ++ tcp_mstamp_refresh(meta_tp); ++ ++ tcp_sk(sk)->send_mp_fclose = 1; ++ /** Reset all other subflows */ ++ ++ /* tcp_done must be handled with bh disabled */ ++ if (!in_serving_softirq()) ++ local_bh_disable(); ++ ++ mptcp_sub_force_close_all(mpcb, sk); ++ ++ tcp_set_state(sk, TCP_RST_WAIT); ++ ++ if (!in_serving_softirq()) ++ local_bh_enable(); ++ ++ tcp_send_ack(sk); ++ tcp_clear_xmit_timers(sk); ++ inet_csk_reset_keepalive_timer(sk, inet_csk(sk)->icsk_rto); ++ ++ meta_tp->send_mp_fclose = 1; ++ inet_csk(sk)->icsk_retransmits = 0; ++ ++ /* Prevent exp backoff reverting on ICMP dest unreachable */ ++ inet_csk(sk)->icsk_backoff = 0; ++ ++ MPTCP_INC_STATS(sock_net(meta_sk), MPTCP_MIB_FASTCLOSETX); ++} ++ ++static void mptcp_ack_retransmit_timer(struct sock *sk) ++{ ++ struct inet_connection_sock *icsk = inet_csk(sk); ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct net *net = 
sock_net(sk); ++ struct sk_buff *skb; ++ ++ if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk)) ++ goto out; /* Routing failure or similar */ ++ ++ tcp_mstamp_refresh(tp); ++ ++ if (tcp_write_timeout(sk)) { ++ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINACKRTO); ++ tp->mptcp->pre_established = 0; ++ sk_stop_timer(sk, &tp->mptcp->mptcp_ack_timer); ++ tp->ops->send_active_reset(sk, GFP_ATOMIC); ++ goto out; ++ } ++ ++ skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); ++ if (skb == NULL) { ++ sk_reset_timer(sk, &tp->mptcp->mptcp_ack_timer, ++ jiffies + icsk->icsk_rto); ++ return; ++ } ++ ++ /* Reserve space for headers and prepare control bits */ ++ skb_reserve(skb, MAX_TCP_HEADER); ++ tcp_init_nondata_skb(skb, tp->snd_una, TCPHDR_ACK); ++ ++ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINACKRXMIT); ++ ++ if (tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC) > 0) { ++ /* Retransmission failed because of local congestion, ++ * do not backoff. ++ */ ++ if (!icsk->icsk_retransmits) ++ icsk->icsk_retransmits = 1; ++ sk_reset_timer(sk, &tp->mptcp->mptcp_ack_timer, ++ jiffies + icsk->icsk_rto); ++ return; ++ } ++ ++ if (!tp->retrans_stamp) ++ tp->retrans_stamp = tcp_time_stamp(tp) ? : 1; ++ ++ icsk->icsk_retransmits++; ++ icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); ++ sk_reset_timer(sk, &tp->mptcp->mptcp_ack_timer, ++ jiffies + icsk->icsk_rto); ++ if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1 + 1, 0)) ++ __sk_dst_reset(sk); ++ ++out:; ++} ++ ++void mptcp_ack_handler(struct timer_list *t) ++{ ++ struct mptcp_tcp_sock *mptcp = from_timer(mptcp, t, mptcp_ack_timer); ++ struct sock *sk = (struct sock *)mptcp->tp; ++ struct sock *meta_sk = mptcp_meta_sk(sk); ++ ++ bh_lock_sock(meta_sk); ++ if (sock_owned_by_user(meta_sk)) { ++ /* Try again later */ ++ sk_reset_timer(sk, &tcp_sk(sk)->mptcp->mptcp_ack_timer, ++ jiffies + (HZ / 20)); ++ goto out_unlock; ++ } ++ ++ if (sk->sk_state == TCP_CLOSE) ++ goto out_unlock; ++ if (!tcp_sk(sk)->mptcp->pre_established) ++ goto out_unlock; ++ ++ mptcp_ack_retransmit_timer(sk); ++ ++ sk_mem_reclaim(sk); ++ ++out_unlock: ++ bh_unlock_sock(meta_sk); ++ sock_put(sk); ++} ++ ++/* Similar to tcp_retransmit_skb ++ * ++ * The diff is that we handle the retransmission-stats (retrans_stamp) at the ++ * meta-level. ++ */ ++int mptcp_retransmit_skb(struct sock *meta_sk, struct sk_buff *skb) ++{ ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk); ++ struct sock *subsk; ++ unsigned int limit, mss_now; ++ int err = -1; ++ ++ WARN_ON(TCP_SKB_CB(skb)->sacked); ++ ++ /* Do not sent more than we queued. 1/4 is reserved for possible ++ * copying overhead: fragmentation, tunneling, mangling etc. ++ * ++ * This is a meta-retransmission thus we check on the meta-socket. ++ */ ++ if (refcount_read(&meta_sk->sk_wmem_alloc) > ++ min(meta_sk->sk_wmem_queued + (meta_sk->sk_wmem_queued >> 2), meta_sk->sk_sndbuf)) { ++ return -EAGAIN; ++ } ++ ++ /* We need to make sure that the retransmitted segment can be sent on a ++ * subflow right now. If it is too big, it needs to be fragmented. ++ */ ++ subsk = meta_tp->mpcb->sched_ops->get_subflow(meta_sk, skb, false); ++ if (!subsk) { ++ /* We want to increase icsk_retransmits, thus return 0, so that ++ * mptcp_meta_retransmit_timer enters the desired branch. ++ */ ++ err = 0; ++ goto failed; ++ } ++ mss_now = tcp_current_mss(subsk); ++ ++ /* If the segment was cloned (e.g. a meta retransmission), the header ++ * must be expanded/copied so that there is no corruption of TSO ++ * information. 
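++	 * skb_unclone() only does work if the skb is in fact cloned; it then
++	 * re-allocates the header so that it can be rewritten safely.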
++ */ ++ if (skb_unclone(skb, GFP_ATOMIC)) { ++ err = -ENOMEM; ++ goto failed; ++ } ++ ++ /* Must have been set by mptcp_write_xmit before */ ++ BUG_ON(!tcp_skb_pcount(skb)); ++ ++ limit = mss_now; ++ /* skb->len > mss_now is the equivalent of tso_segs > 1 in ++ * tcp_write_xmit. Otherwise split-point would return 0. ++ */ ++ if (skb->len > mss_now && !tcp_urg_mode(meta_tp)) ++ limit = tcp_mss_split_point(meta_sk, skb, mss_now, ++ UINT_MAX / mss_now, ++ TCP_NAGLE_OFF); ++ ++ limit = min(limit, tcp_wnd_end(meta_tp) - TCP_SKB_CB(skb)->seq); ++ ++ if (skb->len > limit && ++ unlikely(mptcp_fragment(meta_sk, TCP_FRAG_IN_RTX_QUEUE, skb, ++ limit, GFP_ATOMIC, 0))) ++ goto failed; ++ ++ if (!mptcp_skb_entail(subsk, skb, -1)) ++ goto failed; ++ ++ /* Update global TCP statistics. */ ++ MPTCP_INC_STATS(sock_net(meta_sk), MPTCP_MIB_RETRANSSEGS); ++ ++ /* Diff to tcp_retransmit_skb */ ++ ++ /* Save stamp of the first retransmit. */ ++ if (!meta_tp->retrans_stamp) { ++ tcp_mstamp_refresh(meta_tp); ++ meta_tp->retrans_stamp = tcp_time_stamp(meta_tp); ++ } ++ ++ __tcp_push_pending_frames(subsk, mss_now, TCP_NAGLE_PUSH); ++ tcp_update_skb_after_send(meta_sk, skb, meta_tp->tcp_wstamp_ns); ++ meta_tp->lsndtime = tcp_jiffies32; ++ ++ return 0; ++ ++failed: ++ NET_INC_STATS(sock_net(meta_sk), LINUX_MIB_TCPRETRANSFAIL); ++ return err; ++} ++ ++/* Similar to tcp_retransmit_timer ++ * ++ * The diff is that we have to handle retransmissions of the FAST_CLOSE-message ++ * and that we don't have an srtt estimation at the meta-level. ++ */ ++void mptcp_meta_retransmit_timer(struct sock *meta_sk) ++{ ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk); ++ struct mptcp_cb *mpcb = meta_tp->mpcb; ++ struct inet_connection_sock *meta_icsk = inet_csk(meta_sk); ++ int err; ++ ++ /* In fallback, retransmission is handled at the subflow-level */ ++ if (!meta_tp->packets_out || mpcb->infinite_mapping_snd) ++ return; ++ ++ WARN_ON(tcp_rtx_queue_empty(meta_sk)); ++ ++ if (!meta_tp->snd_wnd && !sock_flag(meta_sk, SOCK_DEAD) && ++ !((1 << meta_sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) { ++ /* Receiver dastardly shrinks window. Our retransmits ++ * become zero probes, but we should not timeout this ++ * connection. If the socket is an orphan, time it out, ++ * we cannot allow such beasts to hang infinitely. ++ */ ++ struct inet_sock *meta_inet = inet_sk(meta_sk); ++ if (meta_sk->sk_family == AF_INET) { ++ net_dbg_ratelimited("MPTCP: Peer %pI4:%u/%u unexpectedly shrunk window %u:%u (repaired)\n", ++ &meta_inet->inet_daddr, ++ ntohs(meta_inet->inet_dport), ++ meta_inet->inet_num, meta_tp->snd_una, ++ meta_tp->snd_nxt); ++ } ++#if IS_ENABLED(CONFIG_IPV6) ++ else if (meta_sk->sk_family == AF_INET6) { ++ net_dbg_ratelimited("MPTCP: Peer %pI6:%u/%u unexpectedly shrunk window %u:%u (repaired)\n", ++ &meta_sk->sk_v6_daddr, ++ ntohs(meta_inet->inet_dport), ++ meta_inet->inet_num, meta_tp->snd_una, ++ meta_tp->snd_nxt); ++ } ++#endif ++ if (tcp_jiffies32 - meta_tp->rcv_tstamp > TCP_RTO_MAX) { ++ tcp_write_err(meta_sk); ++ return; ++ } ++ ++ mptcp_retransmit_skb(meta_sk, tcp_rtx_queue_head(meta_sk)); ++ goto out_reset_timer; ++ } ++ ++ if (tcp_write_timeout(meta_sk)) ++ return; ++ ++ if (meta_icsk->icsk_retransmits == 0) ++ NET_INC_STATS(sock_net(meta_sk), LINUX_MIB_TCPTIMEOUTS); ++ ++ meta_icsk->icsk_ca_state = TCP_CA_Loss; ++ ++ err = mptcp_retransmit_skb(meta_sk, tcp_rtx_queue_head(meta_sk)); ++ if (err > 0) { ++ /* Retransmission failed because of local congestion, ++ * do not backoff. 
++ */ ++ if (!meta_icsk->icsk_retransmits) ++ meta_icsk->icsk_retransmits = 1; ++ inet_csk_reset_xmit_timer(meta_sk, ICSK_TIME_RETRANS, ++ min(meta_icsk->icsk_rto, TCP_RESOURCE_PROBE_INTERVAL), ++ TCP_RTO_MAX); ++ return; ++ } ++ ++ /* Increase the timeout each time we retransmit. Note that ++ * we do not increase the rtt estimate. rto is initialized ++ * from rtt, but increases here. Jacobson (SIGCOMM 88) suggests ++ * that doubling rto each time is the least we can get away with. ++ * In KA9Q, Karn uses this for the first few times, and then ++ * goes to quadratic. netBSD doubles, but only goes up to *64, ++ * and clamps at 1 to 64 sec afterwards. Note that 120 sec is ++ * defined in the protocol as the maximum possible RTT. I guess ++ * we'll have to use something other than TCP to talk to the ++ * University of Mars. ++ * ++ * PAWS allows us longer timeouts and large windows, so once ++ * implemented ftp to mars will work nicely. We will have to fix ++ * the 120 second clamps though! ++ */ ++ meta_icsk->icsk_backoff++; ++ meta_icsk->icsk_retransmits++; ++ ++out_reset_timer: ++ /* If stream is thin, use linear timeouts. Since 'icsk_backoff' is ++ * used to reset timer, set to 0. Recalculate 'icsk_rto' as this ++ * might be increased if the stream oscillates between thin and thick, ++ * thus the old value might already be too high compared to the value ++ * set by 'tcp_set_rto' in tcp_input.c which resets the rto without ++ * backoff. Limit to TCP_THIN_LINEAR_RETRIES before initiating ++ * exponential backoff behaviour to avoid continue hammering ++ * linear-timeout retransmissions into a black hole ++ */ ++ if (meta_sk->sk_state == TCP_ESTABLISHED && ++ (meta_tp->thin_lto || sock_net(meta_sk)->ipv4.sysctl_tcp_thin_linear_timeouts) && ++ tcp_stream_is_thin(meta_tp) && ++ meta_icsk->icsk_retransmits <= TCP_THIN_LINEAR_RETRIES) { ++ meta_icsk->icsk_backoff = 0; ++ /* We cannot do the same as in tcp_write_timer because the ++ * srtt is not set here. ++ */ ++ mptcp_set_rto(meta_sk); ++ } else { ++ /* Use normal (exponential) backoff */ ++ meta_icsk->icsk_rto = min(meta_icsk->icsk_rto << 1, TCP_RTO_MAX); ++ } ++ inet_csk_reset_xmit_timer(meta_sk, ICSK_TIME_RETRANS, meta_icsk->icsk_rto, TCP_RTO_MAX); ++ ++ return; ++} ++ ++void mptcp_sub_retransmit_timer(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ ++ tcp_retransmit_timer(sk); ++ ++ if (!tp->fastopen_rsk) { ++ mptcp_reinject_data(sk, 1); ++ mptcp_set_rto(sk); ++ } ++} ++ ++/* Modify values to an mptcp-level for the initial window of new subflows */ ++void mptcp_select_initial_window(const struct sock *sk, int __space, __u32 mss, ++ __u32 *rcv_wnd, __u32 *window_clamp, ++ int wscale_ok, __u8 *rcv_wscale, ++ __u32 init_rcv_wnd) ++{ ++ const struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb; ++ ++ *window_clamp = mpcb->orig_window_clamp; ++ __space = tcp_win_from_space(sk, mpcb->orig_sk_rcvbuf); ++ ++ tcp_select_initial_window(sk, __space, mss, rcv_wnd, window_clamp, ++ wscale_ok, rcv_wscale, init_rcv_wnd); ++} ++ ++static inline u64 mptcp_calc_rate(const struct sock *meta_sk, unsigned int mss) ++{ ++ struct mptcp_tcp_sock *mptcp; ++ u64 rate = 0; ++ ++ mptcp_for_each_sub(tcp_sk(meta_sk)->mpcb, mptcp) { ++ struct sock *sk = mptcp_to_sock(mptcp); ++ struct tcp_sock *tp = tcp_sk(sk); ++ int this_mss; ++ u64 this_rate; ++ ++ if (!mptcp_sk_can_send(sk)) ++ continue; ++ ++ /* Do not consider subflows without a RTT estimation yet ++ * otherwise this_rate >>> rate. 
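++		 * (A zero srtt_us would also make the div64_u64() below
++		 * divide by zero.)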
++ */
++ if (unlikely(!tp->srtt_us))
++ continue;
++
++ this_mss = tcp_current_mss(sk);
++
++ /* If this_mss is smaller than mss, it means that a segment will
++ * be split in two (or more) when pushed on this subflow. If
++ * you consider that mss = 1428 and this_mss = 1420 then two
++ * segments will be generated: a 1420-byte and 8-byte segment.
++ * The latter will introduce a large overhead as for a single
++ * data segment 2 slots will be used in the congestion window.
++ * Therefore reducing by ~2 the potential throughput of this
++ * subflow. Indeed, 1428 will be sent while 2840 could have been
++ * sent if mss == 1420, reducing the throughput by 2840 / 1428.
++ *
++ * The following algorithm takes into account this overhead
++ * when computing the potential throughput that MPTCP can
++ * achieve when generating mss-byte segments.
++ *
++ * The formula is the following:
++ * \sum_{\forall sub} ratio * \frac{mss * cwnd_sub}{rtt_sub}
++ * Where ratio is computed as follows:
++ * \frac{mss}{\ceil{mss / mss_sub} * mss_sub}
++ *
++ * ratio gives the reduction factor of the theoretical
++ * throughput a subflow can achieve if MPTCP uses a specific
++ * MSS value.
++ */
++ this_rate = div64_u64((u64)mss * mss * (USEC_PER_SEC << 3) *
++ max(tp->snd_cwnd, tp->packets_out),
++ (u64)tp->srtt_us *
++ DIV_ROUND_UP(mss, this_mss) * this_mss);
++ rate += this_rate;
++ }
++
++ return rate;
++}
++
++static unsigned int __mptcp_current_mss(const struct sock *meta_sk)
++{
++ struct mptcp_tcp_sock *mptcp;
++ unsigned int mss = 0;
++ u64 rate = 0;
++
++ mptcp_for_each_sub(tcp_sk(meta_sk)->mpcb, mptcp) {
++ struct sock *sk = mptcp_to_sock(mptcp);
++ int this_mss;
++ u64 this_rate;
++
++ if (!mptcp_sk_can_send(sk))
++ continue;
++
++ this_mss = tcp_current_mss(sk);
++
++ /* Same mss values will produce the same throughput. */
++ if (this_mss == mss)
++ continue;
++
++ /* See whether using this mss value can theoretically improve
++ * performance.
++ */
++ this_rate = mptcp_calc_rate(meta_sk, this_mss);
++ if (this_rate >= rate) {
++ mss = this_mss;
++ rate = this_rate;
++ }
++ }
++
++ return mss;
++}
++
++unsigned int mptcp_current_mss(struct sock *meta_sk)
++{
++ unsigned int mss = __mptcp_current_mss(meta_sk);
++
++ /* If no subflow is available, we take a default-mss from the
++ * meta-socket.
++ */
++ return !mss ? tcp_current_mss(meta_sk) : mss;
++}
++
++int mptcp_check_snd_buf(const struct tcp_sock *tp)
++{
++ const struct mptcp_tcp_sock *mptcp;
++ u32 rtt_max = tp->srtt_us;
++ u64 bw_est;
++
++ if (!tp->srtt_us)
++ return tp->reordering + 1;
++
++ mptcp_for_each_sub(tp->mpcb, mptcp) {
++ const struct sock *sk = mptcp_to_sock(mptcp);
++
++ if (!mptcp_sk_can_send(sk))
++ continue;
++
++ if (rtt_max < tcp_sk(sk)->srtt_us)
++ rtt_max = tcp_sk(sk)->srtt_us;
++ }
++
++ bw_est = div64_u64(((u64)tp->snd_cwnd * rtt_max) << 16,
++ (u64)tp->srtt_us);
++
++ return max_t(unsigned int, (u32)(bw_est >> 16),
++ tp->reordering + 1);
++}
++
++unsigned int mptcp_xmit_size_goal(const struct sock *meta_sk, u32 mss_now,
++ int large_allowed)
++{
++ u32 xmit_size_goal = 0;
++
++ if (large_allowed && !tcp_sk(meta_sk)->mpcb->dss_csum) {
++ struct mptcp_tcp_sock *mptcp;
++
++ mptcp_for_each_sub(tcp_sk(meta_sk)->mpcb, mptcp) {
++ struct sock *sk = mptcp_to_sock(mptcp);
++ int this_size_goal;
++
++ if (!mptcp_sk_can_send(sk))
++ continue;
++
++ this_size_goal = tcp_xmit_size_goal(sk, mss_now, 1);
++ if (this_size_goal > xmit_size_goal)
++ xmit_size_goal = this_size_goal;
++ }
++ }
++
++ return max(xmit_size_goal, mss_now);
++}
++
+diff --git a/net/mptcp/mptcp_pm.c b/net/mptcp/mptcp_pm.c
+new file mode 100644
+index 000000000000..0e24e0aaa70a
+--- /dev/null
++++ b/net/mptcp/mptcp_pm.c
+@@ -0,0 +1,226 @@
++/*
++ * MPTCP implementation - MPTCP-subflow-management
++ *
++ * Initial Design & Implementation:
++ * Sébastien Barré
++ *
++ * Current Maintainer & Author:
++ * Christoph Paasch
++ *
++ * Additional authors:
++ * Jaakko Korkeaniemi
++ * Gregory Detal
++ * Fabien Duchêne
++ * Andreas Seelinger
++ * Lavkesh Lahngir
++ * Andreas Ripke
++ * Vlad Dogaru
++ * Octavian Purdila
++ * John Ronan
++ * Catalin Nicutar
++ * Brandon Heller
++ *
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License
++ * as published by the Free Software Foundation; either version
++ * 2 of the License, or (at your option) any later version.
++ */
++
++
++#include
++#include
++
++static DEFINE_SPINLOCK(mptcp_pm_list_lock);
++static LIST_HEAD(mptcp_pm_list);
++
++static int mptcp_default_id(const struct sock *meta_sk, sa_family_t family,
++ union inet_addr *addr, bool *low_prio)
++{
++ return 0;
++}
++
++struct mptcp_pm_ops mptcp_pm_default = {
++ .get_local_id = mptcp_default_id, /* We do not care */
++ .name = "default",
++ .owner = THIS_MODULE,
++};
++
++static struct mptcp_pm_ops *mptcp_pm_find(const char *name)
++{
++ struct mptcp_pm_ops *e;
++
++ list_for_each_entry_rcu(e, &mptcp_pm_list, list) {
++ if (strcmp(e->name, name) == 0)
++ return e;
++ }
++
++ return NULL;
++}
++
++int mptcp_register_path_manager(struct mptcp_pm_ops *pm)
++{
++ int ret = 0;
++
++ if (!pm->get_local_id)
++ return -EINVAL;
++
++ spin_lock(&mptcp_pm_list_lock);
++ if (mptcp_pm_find(pm->name)) {
++ pr_notice("%s already registered\n", pm->name);
++ ret = -EEXIST;
++ } else {
++ list_add_tail_rcu(&pm->list, &mptcp_pm_list);
++ pr_info("%s registered\n", pm->name);
++ }
++ spin_unlock(&mptcp_pm_list_lock);
++
++ return ret;
++}
++EXPORT_SYMBOL_GPL(mptcp_register_path_manager);
++
++void mptcp_unregister_path_manager(struct mptcp_pm_ops *pm)
++{
++ spin_lock(&mptcp_pm_list_lock);
++ list_del_rcu(&pm->list);
++ spin_unlock(&mptcp_pm_list_lock);
++
++ /* Wait for outstanding readers to complete before the
++ * module gets removed entirely.
++ * ++ * A try_module_get() should fail by now as our module is ++ * in "going" state since no refs are held anymore and ++ * module_exit() handler being called. ++ */ ++ synchronize_rcu(); ++} ++EXPORT_SYMBOL_GPL(mptcp_unregister_path_manager); ++ ++void mptcp_get_default_path_manager(char *name) ++{ ++ struct mptcp_pm_ops *pm; ++ ++ BUG_ON(list_empty(&mptcp_pm_list)); ++ ++ rcu_read_lock(); ++ pm = list_entry(mptcp_pm_list.next, struct mptcp_pm_ops, list); ++ strncpy(name, pm->name, MPTCP_PM_NAME_MAX); ++ rcu_read_unlock(); ++} ++ ++int mptcp_set_default_path_manager(const char *name) ++{ ++ struct mptcp_pm_ops *pm; ++ int ret = -ENOENT; ++ ++ spin_lock(&mptcp_pm_list_lock); ++ pm = mptcp_pm_find(name); ++#ifdef CONFIG_MODULES ++ if (!pm && capable(CAP_NET_ADMIN)) { ++ spin_unlock(&mptcp_pm_list_lock); ++ ++ request_module("mptcp_%s", name); ++ spin_lock(&mptcp_pm_list_lock); ++ pm = mptcp_pm_find(name); ++ } ++#endif ++ ++ if (pm) { ++ list_move(&pm->list, &mptcp_pm_list); ++ ret = 0; ++ } else { ++ pr_info("%s is not available\n", name); ++ } ++ spin_unlock(&mptcp_pm_list_lock); ++ ++ return ret; ++} ++ ++static struct mptcp_pm_ops *__mptcp_pm_find_autoload(const char *name) ++{ ++ struct mptcp_pm_ops *pm = mptcp_pm_find(name); ++#ifdef CONFIG_MODULES ++ if (!pm && capable(CAP_NET_ADMIN)) { ++ rcu_read_unlock(); ++ request_module("mptcp_%s", name); ++ rcu_read_lock(); ++ pm = mptcp_pm_find(name); ++ } ++#endif ++ return pm; ++} ++ ++void mptcp_init_path_manager(struct mptcp_cb *mpcb) ++{ ++ struct mptcp_pm_ops *pm; ++ struct sock *meta_sk = mpcb->meta_sk; ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk); ++ ++ rcu_read_lock(); ++ /* if path manager was set using socket option */ ++ if (meta_tp->mptcp_pm_setsockopt) { ++ pm = __mptcp_pm_find_autoload(meta_tp->mptcp_pm_name); ++ if (pm && try_module_get(pm->owner)) { ++ mpcb->pm_ops = pm; ++ goto out; ++ } ++ } ++ ++ list_for_each_entry_rcu(pm, &mptcp_pm_list, list) { ++ if (try_module_get(pm->owner)) { ++ mpcb->pm_ops = pm; ++ break; ++ } ++ } ++out: ++ rcu_read_unlock(); ++} ++ ++/* Change path manager for socket */ ++int mptcp_set_path_manager(struct sock *sk, const char *name) ++{ ++ struct mptcp_pm_ops *pm; ++ int err = 0; ++ ++ rcu_read_lock(); ++ pm = __mptcp_pm_find_autoload(name); ++ ++ if (!pm) { ++ err = -ENOENT; ++ } else if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { ++ err = -EPERM; ++ } else { ++ strcpy(tcp_sk(sk)->mptcp_pm_name, name); ++ tcp_sk(sk)->mptcp_pm_setsockopt = 1; ++ } ++ rcu_read_unlock(); ++ ++ return err; ++} ++ ++/* Manage refcounts on socket close. */ ++void mptcp_cleanup_path_manager(struct mptcp_cb *mpcb) ++{ ++ module_put(mpcb->pm_ops->owner); ++} ++ ++/* Fallback to the default path-manager. */ ++void mptcp_fallback_default(struct mptcp_cb *mpcb) ++{ ++ struct mptcp_pm_ops *pm; ++ ++ mptcp_cleanup_path_manager(mpcb); ++ pm = mptcp_pm_find("default"); ++ ++ /* Cannot fail - it's the default module */ ++ try_module_get(pm->owner); ++ mpcb->pm_ops = pm; ++} ++EXPORT_SYMBOL_GPL(mptcp_fallback_default); ++ ++/* Set default value from kernel configuration at bootup */ ++static int __init mptcp_path_manager_default(void) ++{ ++ return mptcp_set_default_path_manager(CONFIG_DEFAULT_MPTCP_PM); ++} ++late_initcall(mptcp_path_manager_default); +diff --git a/net/mptcp/mptcp_redundant.c b/net/mptcp/mptcp_redundant.c +new file mode 100644 +index 000000000000..3db4e69acef2 +--- /dev/null ++++ b/net/mptcp/mptcp_redundant.c +@@ -0,0 +1,395 @@ ++/* ++ * MPTCP Scheduler to reduce latency and jitter. 
++ * ++ * This scheduler sends all packets redundantly on all available subflows. ++ * ++ * Initial Design & Implementation: ++ * Tobias Erbshaeusser ++ * Alexander Froemmgen ++ * ++ * Initial corrections & modifications: ++ * Christian Pinedo ++ * Igor Lopez ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License ++ * as published by the Free Software Foundation; either version ++ * 2 of the License, or (at your option) any later version. ++ */ ++ ++#include ++#include ++ ++/* Struct to store the data of a single subflow */ ++struct redsched_priv { ++ /* The skb or NULL */ ++ struct sk_buff *skb; ++ /* Start/end sequence number of the skb. This number should be checked ++ * to be valid before the skb field is used ++ */ ++ u32 skb_start_seq; ++ u32 skb_end_seq; ++}; ++ ++/* Struct to store the data of the control block */ ++struct redsched_cb { ++ /* The next subflow where a skb should be sent or NULL */ ++ struct tcp_sock *next_subflow; ++}; ++ ++/* Returns the socket data from a given subflow socket */ ++static struct redsched_priv *redsched_get_priv(struct tcp_sock *tp) ++{ ++ return (struct redsched_priv *)&tp->mptcp->mptcp_sched[0]; ++} ++ ++/* Returns the control block data from a given meta socket */ ++static struct redsched_cb *redsched_get_cb(struct tcp_sock *tp) ++{ ++ return (struct redsched_cb *)&tp->mpcb->mptcp_sched[0]; ++} ++ ++static bool redsched_get_active_valid_sks(struct sock *meta_sk) ++{ ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk); ++ struct mptcp_cb *mpcb = meta_tp->mpcb; ++ struct mptcp_tcp_sock *mptcp; ++ int active_valid_sks = 0; ++ ++ mptcp_for_each_sub(mpcb, mptcp) { ++ struct sock *sk = mptcp_to_sock(mptcp); ++ ++ if (subflow_is_active((struct tcp_sock *)sk) && ++ !mptcp_is_def_unavailable(sk)) ++ active_valid_sks++; ++ } ++ ++ return active_valid_sks; ++} ++ ++static bool redsched_use_subflow(struct sock *meta_sk, ++ int active_valid_sks, ++ struct tcp_sock *tp, ++ struct sk_buff *skb) ++{ ++ if (!skb || !mptcp_is_available((struct sock *)tp, skb, false)) ++ return false; ++ ++ if (TCP_SKB_CB(skb)->path_mask != 0) ++ return subflow_is_active(tp); ++ ++ if (TCP_SKB_CB(skb)->path_mask == 0) { ++ if (active_valid_sks == -1) ++ active_valid_sks = redsched_get_active_valid_sks(meta_sk); ++ ++ if (subflow_is_backup(tp) && active_valid_sks > 0) ++ return false; ++ else ++ return true; ++ } ++ ++ return false; ++} ++ ++#define mptcp_entry_next_rcu(__mptcp) \ ++ hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu( \ ++ &(__mptcp)->node)), struct mptcp_tcp_sock, node) ++ ++static void redsched_update_next_subflow(struct tcp_sock *tp, ++ struct redsched_cb *red_cb) ++{ ++ struct mptcp_tcp_sock *mptcp = mptcp_entry_next_rcu(tp->mptcp); ++ ++ if (mptcp) ++ red_cb->next_subflow = mptcp->tp; ++ else ++ red_cb->next_subflow = NULL; ++} ++ ++static struct sock *red_get_available_subflow(struct sock *meta_sk, ++ struct sk_buff *skb, ++ bool zero_wnd_test) ++{ ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk); ++ struct mptcp_cb *mpcb = meta_tp->mpcb; ++ struct redsched_cb *red_cb = redsched_get_cb(meta_tp); ++ struct tcp_sock *first_tp = red_cb->next_subflow, *tp; ++ struct mptcp_tcp_sock *mptcp; ++ int found = 0; ++ ++ /* Answer data_fin on same subflow */ ++ if (meta_sk->sk_shutdown & RCV_SHUTDOWN && ++ skb && mptcp_is_data_fin(skb)) { ++ mptcp_for_each_sub(mpcb, mptcp) { ++ struct sock *sk = mptcp_to_sock(mptcp); ++ ++ if (tcp_sk(sk)->mptcp->path_index == ++ mpcb->dfin_path_index && ++ 
mptcp_is_available(sk, skb, zero_wnd_test)) ++ return sk; ++ } ++ } ++ ++ if (!first_tp && !hlist_empty(&mpcb->conn_list)) { ++ first_tp = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(&mpcb->conn_list)), ++ struct mptcp_tcp_sock, node)->tp; ++ } ++ tp = first_tp; ++ ++ /* still NULL (no subflow in conn_list?) */ ++ if (!first_tp) ++ return NULL; ++ ++ /* Search for a subflow to send it. ++ * ++ * We want to pick a subflow that is after 'first_tp' in the list of subflows. ++ * Thus, the first mptcp_for_each_sub()-loop tries to walk the list up ++ * to the subflow 'tp' and then checks whether any one of the remaining ++ * ones is eligible to send. ++ * The second mptcp_for_each-sub()-loop is then iterating from the ++ * beginning of the list up to 'first_tp'. ++ */ ++ mptcp_for_each_sub(mpcb, mptcp) { ++ /* We go up to the subflow 'tp' and start from there */ ++ if (tp == mptcp->tp) ++ found = 1; ++ ++ if (!found) ++ continue; ++ tp = mptcp->tp; ++ ++ if (mptcp_is_available((struct sock *)tp, skb, ++ zero_wnd_test)) { ++ redsched_update_next_subflow(tp, red_cb); ++ return (struct sock *)tp; ++ } ++ } ++ ++ mptcp_for_each_sub(mpcb, mptcp) { ++ tp = mptcp->tp; ++ ++ if (tp == first_tp) ++ break; ++ ++ if (mptcp_is_available((struct sock *)tp, skb, ++ zero_wnd_test)) { ++ redsched_update_next_subflow(tp, red_cb); ++ return (struct sock *)tp; ++ } ++ } ++ ++ /* No space */ ++ return NULL; ++} ++ ++/* Corrects the stored skb pointers if they are invalid */ ++static void redsched_correct_skb_pointers(struct sock *meta_sk, ++ struct redsched_priv *red_p) ++{ ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk); ++ ++ if (red_p->skb && ++ (!after(red_p->skb_start_seq, meta_tp->snd_una) || ++ after(red_p->skb_end_seq, meta_tp->snd_nxt))) ++ red_p->skb = NULL; ++} ++ ++/* Returns the next skb from the queue */ ++static struct sk_buff *redsched_next_skb_from_queue(struct sk_buff_head *queue, ++ struct sk_buff *previous, ++ struct sock *meta_sk) ++{ ++ struct sk_buff *skb; ++ ++ if (!previous) ++ return tcp_rtx_queue_head(meta_sk) ? : skb_peek(queue); ++ ++ /* sk_data->skb stores the last scheduled packet for this subflow. ++ * If sk_data->skb was scheduled but not sent (e.g., due to nagle), ++ * we have to schedule it again. ++ * ++ * For the redundant scheduler, there are two cases: ++ * 1. sk_data->skb was not sent on another subflow: ++ * we have to schedule it again to ensure that we do not ++ * skip this packet. ++ * 2. sk_data->skb was already sent on another subflow: ++ * with regard to the redundant semantic, we have to ++ * schedule it again. However, we keep it simple and ignore it, ++ * as it was already sent by another subflow. ++ * This might be changed in the future. ++ * ++ * For case 1, send_head is equal previous, as only a single ++ * packet can be skipped. ++ */ ++ if (tcp_send_head(meta_sk) == previous) ++ return tcp_send_head(meta_sk); ++ ++ skb = skb_rb_next(previous); ++ if (skb) ++ return skb; ++ ++ return tcp_send_head(meta_sk); ++} ++ ++static struct sk_buff *mptcp_red_next_segment(struct sock *meta_sk, ++ int *reinject, ++ struct sock **subsk, ++ unsigned int *limit) ++{ ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk); ++ struct mptcp_cb *mpcb = meta_tp->mpcb; ++ struct redsched_cb *red_cb = redsched_get_cb(meta_tp); ++ struct tcp_sock *first_tp = red_cb->next_subflow, *tp; ++ struct mptcp_tcp_sock *mptcp; ++ int active_valid_sks = -1; ++ struct sk_buff *skb; ++ int found = 0; ++ ++ /* As we set it, we have to reset it as well. 
*/ ++ *limit = 0; ++ ++ if (skb_queue_empty(&mpcb->reinject_queue) && ++ skb_queue_empty(&meta_sk->sk_write_queue) && ++ tcp_rtx_queue_empty(meta_sk)) ++ /* Nothing to send */ ++ return NULL; ++ ++ /* First try reinjections */ ++ skb = skb_peek(&mpcb->reinject_queue); ++ if (skb) { ++ *subsk = get_available_subflow(meta_sk, skb, false); ++ if (!*subsk) ++ return NULL; ++ *reinject = 1; ++ return skb; ++ } ++ ++ /* Then try indistinctly redundant and normal skbs */ ++ ++ if (!first_tp && !hlist_empty(&mpcb->conn_list)) { ++ first_tp = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(&mpcb->conn_list)), ++ struct mptcp_tcp_sock, node)->tp; ++ } ++ ++ /* still NULL (no subflow in conn_list?) */ ++ if (!first_tp) ++ return NULL; ++ ++ tp = first_tp; ++ ++ *reinject = 0; ++ active_valid_sks = redsched_get_active_valid_sks(meta_sk); ++ ++ /* We want to pick a subflow that is after 'first_tp' in the list of subflows. ++ * Thus, the first mptcp_for_each_sub()-loop tries to walk the list up ++ * to the subflow 'tp' and then checks whether any one of the remaining ++ * ones can send a segment. ++ * The second mptcp_for_each-sub()-loop is then iterating from the ++ * beginning of the list up to 'first_tp'. ++ */ ++ mptcp_for_each_sub(mpcb, mptcp) { ++ struct redsched_priv *red_p; ++ ++ if (tp == mptcp->tp) ++ found = 1; ++ ++ if (!found) ++ continue; ++ ++ tp = mptcp->tp; ++ ++ /* Correct the skb pointers of the current subflow */ ++ red_p = redsched_get_priv(tp); ++ redsched_correct_skb_pointers(meta_sk, red_p); ++ ++ skb = redsched_next_skb_from_queue(&meta_sk->sk_write_queue, ++ red_p->skb, meta_sk); ++ if (skb && redsched_use_subflow(meta_sk, active_valid_sks, tp, ++ skb)) { ++ red_p->skb = skb; ++ red_p->skb_start_seq = TCP_SKB_CB(skb)->seq; ++ red_p->skb_end_seq = TCP_SKB_CB(skb)->end_seq; ++ redsched_update_next_subflow(tp, red_cb); ++ *subsk = (struct sock *)tp; ++ ++ if (TCP_SKB_CB(skb)->path_mask) ++ *reinject = -1; ++ return skb; ++ } ++ } ++ ++ mptcp_for_each_sub(mpcb, mptcp) { ++ struct redsched_priv *red_p; ++ ++ tp = mptcp->tp; ++ ++ if (tp == first_tp) ++ break; ++ ++ /* Correct the skb pointers of the current subflow */ ++ red_p = redsched_get_priv(tp); ++ redsched_correct_skb_pointers(meta_sk, red_p); ++ ++ skb = redsched_next_skb_from_queue(&meta_sk->sk_write_queue, ++ red_p->skb, meta_sk); ++ if (skb && redsched_use_subflow(meta_sk, active_valid_sks, tp, ++ skb)) { ++ red_p->skb = skb; ++ red_p->skb_start_seq = TCP_SKB_CB(skb)->seq; ++ red_p->skb_end_seq = TCP_SKB_CB(skb)->end_seq; ++ redsched_update_next_subflow(tp, red_cb); ++ *subsk = (struct sock *)tp; ++ ++ if (TCP_SKB_CB(skb)->path_mask) ++ *reinject = -1; ++ return skb; ++ } ++ } ++ ++ /* Nothing to send */ ++ return NULL; ++} ++ ++static void redsched_release(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct redsched_cb *red_cb = redsched_get_cb(tp); ++ ++ /* Check if the next subflow would be the released one. 
If yes correct ++ * the pointer ++ */ ++ if (red_cb->next_subflow == tp) ++ redsched_update_next_subflow(tp, red_cb); ++} ++ ++static struct mptcp_sched_ops mptcp_sched_red = { ++ .get_subflow = red_get_available_subflow, ++ .next_segment = mptcp_red_next_segment, ++ .release = redsched_release, ++ .name = "redundant", ++ .owner = THIS_MODULE, ++}; ++ ++static int __init red_register(void) ++{ ++ BUILD_BUG_ON(sizeof(struct redsched_priv) > MPTCP_SCHED_SIZE); ++ BUILD_BUG_ON(sizeof(struct redsched_cb) > MPTCP_SCHED_DATA_SIZE); ++ ++ if (mptcp_register_scheduler(&mptcp_sched_red)) ++ return -1; ++ ++ return 0; ++} ++ ++static void red_unregister(void) ++{ ++ mptcp_unregister_scheduler(&mptcp_sched_red); ++} ++ ++module_init(red_register); ++module_exit(red_unregister); ++ ++MODULE_AUTHOR("Tobias Erbshaeusser, Alexander Froemmgen"); ++MODULE_LICENSE("GPL"); ++MODULE_DESCRIPTION("REDUNDANT MPTCP"); ++MODULE_VERSION("0.90"); +diff --git a/net/mptcp/mptcp_rr.c b/net/mptcp/mptcp_rr.c +new file mode 100644 +index 000000000000..396e8aaf4762 +--- /dev/null ++++ b/net/mptcp/mptcp_rr.c +@@ -0,0 +1,309 @@ ++/* MPTCP Scheduler module selector. Highly inspired by tcp_cong.c */ ++ ++#include ++#include ++ ++static unsigned char num_segments __read_mostly = 1; ++module_param(num_segments, byte, 0644); ++MODULE_PARM_DESC(num_segments, "The number of consecutive segments that are part of a burst"); ++ ++static bool cwnd_limited __read_mostly = 1; ++module_param(cwnd_limited, bool, 0644); ++MODULE_PARM_DESC(cwnd_limited, "if set to 1, the scheduler tries to fill the congestion-window on all subflows"); ++ ++struct rrsched_priv { ++ unsigned char quota; ++}; ++ ++static struct rrsched_priv *rrsched_get_priv(const struct tcp_sock *tp) ++{ ++ return (struct rrsched_priv *)&tp->mptcp->mptcp_sched[0]; ++} ++ ++/* If the sub-socket sk available to send the skb? */ ++static bool mptcp_rr_is_available(const struct sock *sk, const struct sk_buff *skb, ++ bool zero_wnd_test, bool cwnd_test) ++{ ++ const struct tcp_sock *tp = tcp_sk(sk); ++ unsigned int space, in_flight; ++ ++ /* Set of states for which we are allowed to send data */ ++ if (!mptcp_sk_can_send(sk)) ++ return false; ++ ++ /* We do not send data on this subflow unless it is ++ * fully established, i.e. the 4th ack has been received. ++ */ ++ if (tp->mptcp->pre_established) ++ return false; ++ ++ if (tp->pf) ++ return false; ++ ++ if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) { ++ /* If SACK is disabled, and we got a loss, TCP does not exit ++ * the loss-state until something above high_seq has been acked. ++ * (see tcp_try_undo_recovery) ++ * ++ * high_seq is the snd_nxt at the moment of the RTO. As soon ++ * as we have an RTO, we won't push data on the subflow. ++ * Thus, snd_una can never go beyond high_seq. ++ */ ++ if (!tcp_is_reno(tp)) ++ return false; ++ else if (tp->snd_una != tp->high_seq) ++ return false; ++ } ++ ++ if (!tp->mptcp->fully_established) { ++ /* Make sure that we send in-order data */ ++ if (skb && tp->mptcp->second_packet && ++ tp->mptcp->last_end_data_seq != TCP_SKB_CB(skb)->seq) ++ return false; ++ } ++ ++ if (!cwnd_test) ++ goto zero_wnd_test; ++ ++ in_flight = tcp_packets_in_flight(tp); ++ /* Not even a single spot in the cwnd */ ++ if (in_flight >= tp->snd_cwnd) ++ return false; ++ ++ /* Now, check if what is queued in the subflow's send-queue ++ * already fills the cwnd. 
++ */ ++ space = (tp->snd_cwnd - in_flight) * tp->mss_cache; ++ ++ if (tp->write_seq - tp->snd_nxt > space) ++ return false; ++ ++zero_wnd_test: ++ if (zero_wnd_test && !before(tp->write_seq, tcp_wnd_end(tp))) ++ return false; ++ ++ return true; ++} ++ ++/* Are we not allowed to reinject this skb on tp? */ ++static int mptcp_rr_dont_reinject_skb(const struct tcp_sock *tp, const struct sk_buff *skb) ++{ ++ /* If the skb has already been enqueued in this sk, try to find ++ * another one. ++ */ ++ return skb && ++ /* Has the skb already been enqueued into this subsocket? */ ++ mptcp_pi_to_flag(tp->mptcp->path_index) & TCP_SKB_CB(skb)->path_mask; ++} ++ ++/* We just look for any subflow that is available */ ++static struct sock *rr_get_available_subflow(struct sock *meta_sk, ++ struct sk_buff *skb, ++ bool zero_wnd_test) ++{ ++ const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; ++ struct sock *sk = NULL, *bestsk = NULL, *backupsk = NULL; ++ struct mptcp_tcp_sock *mptcp; ++ ++ /* Answer data_fin on same subflow!!! */ ++ if (meta_sk->sk_shutdown & RCV_SHUTDOWN && ++ skb && mptcp_is_data_fin(skb)) { ++ mptcp_for_each_sub(mpcb, mptcp) { ++ sk = mptcp_to_sock(mptcp); ++ if (tcp_sk(sk)->mptcp->path_index == mpcb->dfin_path_index && ++ mptcp_rr_is_available(sk, skb, zero_wnd_test, true)) ++ return sk; ++ } ++ } ++ ++ /* First, find the best subflow */ ++ mptcp_for_each_sub(mpcb, mptcp) { ++ struct tcp_sock *tp; ++ ++ sk = mptcp_to_sock(mptcp); ++ tp = tcp_sk(sk); ++ ++ if (!mptcp_rr_is_available(sk, skb, zero_wnd_test, true)) ++ continue; ++ ++ if (mptcp_rr_dont_reinject_skb(tp, skb)) { ++ backupsk = sk; ++ continue; ++ } ++ ++ bestsk = sk; ++ } ++ ++ if (bestsk) { ++ sk = bestsk; ++ } else if (backupsk) { ++ /* It has been sent on all subflows once - let's give it a ++ * chance again by restarting its pathmask. ++ */ ++ if (skb) ++ TCP_SKB_CB(skb)->path_mask = 0; ++ sk = backupsk; ++ } ++ ++ return sk; ++} ++ ++/* Returns the next segment to be sent from the mptcp meta-queue. ++ * (chooses the reinject queue if any segment is waiting in it, otherwise, ++ * chooses the normal write queue). ++ * Sets *@reinject to 1 if the returned segment comes from the ++ * reinject queue. Sets it to 0 if it is the regular send-head of the meta-sk, ++ * and sets it to -1 if it is a meta-level retransmission to optimize the ++ * receive-buffer. ++ */ ++static struct sk_buff *__mptcp_rr_next_segment(const struct sock *meta_sk, int *reinject) ++{ ++ const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; ++ struct sk_buff *skb = NULL; ++ ++ *reinject = 0; ++ ++ /* If we are in fallback-mode, just take from the meta-send-queue */ ++ if (mpcb->infinite_mapping_snd || mpcb->send_infinite_mapping) ++ return tcp_send_head(meta_sk); ++ ++ skb = skb_peek(&mpcb->reinject_queue); ++ ++ if (skb) ++ *reinject = 1; ++ else ++ skb = tcp_send_head(meta_sk); ++ return skb; ++} ++ ++static struct sk_buff *mptcp_rr_next_segment(struct sock *meta_sk, ++ int *reinject, ++ struct sock **subsk, ++ unsigned int *limit) ++{ ++ const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; ++ struct sock *choose_sk = NULL; ++ struct mptcp_tcp_sock *mptcp; ++ struct sk_buff *skb = __mptcp_rr_next_segment(meta_sk, reinject); ++ unsigned char split = num_segments; ++ unsigned char iter = 0, full_subs = 0; ++ ++ /* As we set it, we have to reset it as well. 
*/ ++ *limit = 0; ++ ++ if (!skb) ++ return NULL; ++ ++ if (*reinject) { ++ *subsk = rr_get_available_subflow(meta_sk, skb, false); ++ if (!*subsk) ++ return NULL; ++ ++ return skb; ++ } ++ ++retry: ++ ++ /* First, we look for a subflow who is currently being used */ ++ mptcp_for_each_sub(mpcb, mptcp) { ++ struct sock *sk_it = mptcp_to_sock(mptcp); ++ struct tcp_sock *tp_it = tcp_sk(sk_it); ++ struct rrsched_priv *rr_p = rrsched_get_priv(tp_it); ++ ++ if (!mptcp_rr_is_available(sk_it, skb, false, cwnd_limited)) ++ continue; ++ ++ iter++; ++ ++ /* Is this subflow currently being used? */ ++ if (rr_p->quota > 0 && rr_p->quota < num_segments) { ++ split = num_segments - rr_p->quota; ++ choose_sk = sk_it; ++ goto found; ++ } ++ ++ /* Or, it's totally unused */ ++ if (!rr_p->quota) { ++ split = num_segments; ++ choose_sk = sk_it; ++ } ++ ++ /* Or, it must then be fully used */ ++ if (rr_p->quota >= num_segments) ++ full_subs++; ++ } ++ ++ /* All considered subflows have a full quota, and we considered at ++ * least one. ++ */ ++ if (iter && iter == full_subs) { ++ /* So, we restart this round by setting quota to 0 and retry ++ * to find a subflow. ++ */ ++ mptcp_for_each_sub(mpcb, mptcp) { ++ struct sock *sk_it = mptcp_to_sock(mptcp); ++ struct tcp_sock *tp_it = tcp_sk(sk_it); ++ struct rrsched_priv *rr_p = rrsched_get_priv(tp_it); ++ ++ if (!mptcp_rr_is_available(sk_it, skb, false, cwnd_limited)) ++ continue; ++ ++ rr_p->quota = 0; ++ } ++ ++ goto retry; ++ } ++ ++found: ++ if (choose_sk) { ++ unsigned int mss_now; ++ struct tcp_sock *choose_tp = tcp_sk(choose_sk); ++ struct rrsched_priv *rr_p = rrsched_get_priv(choose_tp); ++ ++ if (!mptcp_rr_is_available(choose_sk, skb, false, true)) ++ return NULL; ++ ++ *subsk = choose_sk; ++ mss_now = tcp_current_mss(*subsk); ++ *limit = split * mss_now; ++ ++ if (skb->len > mss_now) ++ rr_p->quota += DIV_ROUND_UP(skb->len, mss_now); ++ else ++ rr_p->quota++; ++ ++ return skb; ++ } ++ ++ return NULL; ++} ++ ++static struct mptcp_sched_ops mptcp_sched_rr = { ++ .get_subflow = rr_get_available_subflow, ++ .next_segment = mptcp_rr_next_segment, ++ .name = "roundrobin", ++ .owner = THIS_MODULE, ++}; ++ ++static int __init rr_register(void) ++{ ++ BUILD_BUG_ON(sizeof(struct rrsched_priv) > MPTCP_SCHED_SIZE); ++ ++ if (mptcp_register_scheduler(&mptcp_sched_rr)) ++ return -1; ++ ++ return 0; ++} ++ ++static void rr_unregister(void) ++{ ++ mptcp_unregister_scheduler(&mptcp_sched_rr); ++} ++ ++module_init(rr_register); ++module_exit(rr_unregister); ++ ++MODULE_AUTHOR("Christoph Paasch"); ++MODULE_LICENSE("GPL"); ++MODULE_DESCRIPTION("ROUNDROBIN MPTCP"); ++MODULE_VERSION("0.89"); +diff --git a/net/mptcp/mptcp_sched.c b/net/mptcp/mptcp_sched.c +new file mode 100644 +index 000000000000..eed9bfb44b59 +--- /dev/null ++++ b/net/mptcp/mptcp_sched.c +@@ -0,0 +1,677 @@ ++/* MPTCP Scheduler module selector. Highly inspired by tcp_cong.c */ ++ ++#include ++#include ++#include ++#include ++ ++static DEFINE_SPINLOCK(mptcp_sched_list_lock); ++static LIST_HEAD(mptcp_sched_list); ++ ++struct defsched_priv { ++ u32 last_rbuf_opti; ++}; ++ ++static struct defsched_priv *defsched_get_priv(const struct tcp_sock *tp) ++{ ++ return (struct defsched_priv *)&tp->mptcp->mptcp_sched[0]; ++} ++ ++bool mptcp_is_def_unavailable(struct sock *sk) ++{ ++ const struct tcp_sock *tp = tcp_sk(sk); ++ ++ /* Set of states for which we are allowed to send data */ ++ if (!mptcp_sk_can_send(sk)) ++ return true; ++ ++ /* We do not send data on this subflow unless it is ++ * fully established, i.e. 
the 4th ack has been received. ++ */ ++ if (tp->mptcp->pre_established) ++ return true; ++ ++ if (tp->pf) ++ return true; ++ ++ return false; ++} ++EXPORT_SYMBOL_GPL(mptcp_is_def_unavailable); ++ ++/* estimate number of segments currently in flight + unsent in ++ * the subflow socket. ++ */ ++static int mptcp_subflow_queued(struct sock *sk, u32 max_tso_segs) ++{ ++ const struct tcp_sock *tp = tcp_sk(sk); ++ unsigned int queued; ++ ++ /* estimate the max number of segments in the write queue ++ * this is an overestimation, avoiding to iterate over the queue ++ * to make a better estimation. ++ * Having only one skb in the queue however might trigger tso deferral, ++ * delaying the sending of a tso segment in the hope that skb_entail ++ * will append more data to the skb soon. ++ * Therefore, in the case only one skb is in the queue, we choose to ++ * potentially underestimate, risking to schedule one skb too many onto ++ * the subflow rather than not enough. ++ */ ++ if (sk->sk_write_queue.qlen > 1) ++ queued = sk->sk_write_queue.qlen * max_tso_segs; ++ else ++ queued = sk->sk_write_queue.qlen; ++ ++ return queued + tcp_packets_in_flight(tp); ++} ++ ++static bool mptcp_is_temp_unavailable(struct sock *sk, ++ const struct sk_buff *skb, ++ bool zero_wnd_test) ++{ ++ const struct tcp_sock *tp = tcp_sk(sk); ++ unsigned int mss_now; ++ ++ if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) { ++ /* If SACK is disabled, and we got a loss, TCP does not exit ++ * the loss-state until something above high_seq has been ++ * acked. (see tcp_try_undo_recovery) ++ * ++ * high_seq is the snd_nxt at the moment of the RTO. As soon ++ * as we have an RTO, we won't push data on the subflow. ++ * Thus, snd_una can never go beyond high_seq. ++ */ ++ if (!tcp_is_reno(tp)) ++ return true; ++ else if (tp->snd_una != tp->high_seq) ++ return true; ++ } ++ ++ if (!tp->mptcp->fully_established) { ++ /* Make sure that we send in-order data */ ++ if (skb && tp->mptcp->second_packet && ++ tp->mptcp->last_end_data_seq != TCP_SKB_CB(skb)->seq) ++ return true; ++ } ++ ++ mss_now = tcp_current_mss(sk); ++ ++ /* Not even a single spot in the cwnd */ ++ if (mptcp_subflow_queued(sk, tcp_tso_segs(sk, mss_now)) >= tp->snd_cwnd) ++ return true; ++ ++ if (zero_wnd_test && !before(tp->write_seq, tcp_wnd_end(tp))) ++ return true; ++ ++ /* Don't send on this subflow if we bypass the allowed send-window at ++ * the per-subflow level. Similar to tcp_snd_wnd_test, but manually ++ * calculated end_seq (because here at this point end_seq is still at ++ * the meta-level). ++ */ ++ if (skb && zero_wnd_test && ++ after(tp->write_seq + min(skb->len, mss_now), tcp_wnd_end(tp))) ++ return true; ++ ++ return false; ++} ++ ++/* Is the sub-socket sk available to send the skb? */ ++bool mptcp_is_available(struct sock *sk, const struct sk_buff *skb, ++ bool zero_wnd_test) ++{ ++ return !mptcp_is_def_unavailable(sk) && ++ !mptcp_is_temp_unavailable(sk, skb, zero_wnd_test); ++} ++EXPORT_SYMBOL_GPL(mptcp_is_available); ++ ++/* Are we not allowed to reinject this skb on tp? */ ++static int mptcp_dont_reinject_skb(const struct tcp_sock *tp, const struct sk_buff *skb) ++{ ++ /* If the skb has already been enqueued in this sk, try to find ++ * another one. ++ */ ++ return skb && ++ /* Has the skb already been enqueued into this subsocket? 
*/ ++ mptcp_pi_to_flag(tp->mptcp->path_index) & TCP_SKB_CB(skb)->path_mask; ++} ++ ++bool subflow_is_backup(const struct tcp_sock *tp) ++{ ++ return tp->mptcp->rcv_low_prio || tp->mptcp->low_prio; ++} ++EXPORT_SYMBOL_GPL(subflow_is_backup); ++ ++bool subflow_is_active(const struct tcp_sock *tp) ++{ ++ return !tp->mptcp->rcv_low_prio && !tp->mptcp->low_prio; ++} ++EXPORT_SYMBOL_GPL(subflow_is_active); ++ ++/* Generic function to iterate over used and unused subflows and to select the ++ * best one ++ */ ++static struct sock ++*get_subflow_from_selectors(struct mptcp_cb *mpcb, struct sk_buff *skb, ++ bool (*selector)(const struct tcp_sock *), ++ bool zero_wnd_test, bool *force) ++{ ++ struct sock *bestsk = NULL; ++ u32 min_srtt = 0xffffffff; ++ bool found_unused = false; ++ bool found_unused_una = false; ++ struct mptcp_tcp_sock *mptcp; ++ ++ mptcp_for_each_sub(mpcb, mptcp) { ++ struct sock *sk = mptcp_to_sock(mptcp); ++ struct tcp_sock *tp = tcp_sk(sk); ++ bool unused = false; ++ ++ /* First, we choose only the wanted sks */ ++ if (!(*selector)(tp)) ++ continue; ++ ++ if (!mptcp_dont_reinject_skb(tp, skb)) ++ unused = true; ++ else if (found_unused) ++ /* If a unused sk was found previously, we continue - ++ * no need to check used sks anymore. ++ */ ++ continue; ++ ++ if (mptcp_is_def_unavailable(sk)) ++ continue; ++ ++ if (mptcp_is_temp_unavailable(sk, skb, zero_wnd_test)) { ++ if (unused) ++ found_unused_una = true; ++ continue; ++ } ++ ++ if (unused) { ++ if (!found_unused) { ++ /* It's the first time we encounter an unused ++ * sk - thus we reset the bestsk (which might ++ * have been set to a used sk). ++ */ ++ min_srtt = 0xffffffff; ++ bestsk = NULL; ++ } ++ found_unused = true; ++ } ++ ++ if (tp->srtt_us < min_srtt) { ++ min_srtt = tp->srtt_us; ++ bestsk = sk; ++ } ++ } ++ ++ if (bestsk) { ++ /* The force variable is used to mark the returned sk as ++ * previously used or not-used. ++ */ ++ if (found_unused) ++ *force = true; ++ else ++ *force = false; ++ } else { ++ /* The force variable is used to mark if there are temporally ++ * unavailable not-used sks. ++ */ ++ if (found_unused_una) ++ *force = true; ++ else ++ *force = false; ++ } ++ ++ return bestsk; ++} ++ ++/* This is the scheduler. This function decides on which flow to send ++ * a given MSS. If all subflows are found to be busy, NULL is returned ++ * The flow is selected based on the shortest RTT. ++ * If all paths have full cong windows, we simply return NULL. ++ * ++ * Additionally, this function is aware of the backup-subflows. ++ */ ++struct sock *get_available_subflow(struct sock *meta_sk, struct sk_buff *skb, ++ bool zero_wnd_test) ++{ ++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; ++ struct sock *sk; ++ bool looping = false, force; ++ ++ /* Answer data_fin on same subflow!!! 
*/ ++ if (meta_sk->sk_shutdown & RCV_SHUTDOWN && ++ skb && mptcp_is_data_fin(skb)) { ++ struct mptcp_tcp_sock *mptcp; ++ ++ mptcp_for_each_sub(mpcb, mptcp) { ++ sk = mptcp_to_sock(mptcp); ++ ++ if (tcp_sk(sk)->mptcp->path_index == mpcb->dfin_path_index && ++ mptcp_is_available(sk, skb, zero_wnd_test)) ++ return sk; ++ } ++ } ++ ++ /* Find the best subflow */ ++restart: ++ sk = get_subflow_from_selectors(mpcb, skb, &subflow_is_active, ++ zero_wnd_test, &force); ++ if (force) ++ /* one unused active sk or one NULL sk when there is at least ++ * one temporally unavailable unused active sk ++ */ ++ return sk; ++ ++ sk = get_subflow_from_selectors(mpcb, skb, &subflow_is_backup, ++ zero_wnd_test, &force); ++ if (!force && skb) { ++ /* one used backup sk or one NULL sk where there is no one ++ * temporally unavailable unused backup sk ++ * ++ * the skb passed through all the available active and backups ++ * sks, so clean the path mask ++ */ ++ TCP_SKB_CB(skb)->path_mask = 0; ++ ++ if (!looping) { ++ looping = true; ++ goto restart; ++ } ++ } ++ return sk; ++} ++EXPORT_SYMBOL_GPL(get_available_subflow); ++ ++static struct sk_buff *mptcp_rcv_buf_optimization(struct sock *sk, int penal) ++{ ++ struct sock *meta_sk; ++ const struct tcp_sock *tp = tcp_sk(sk); ++ struct mptcp_tcp_sock *mptcp; ++ struct sk_buff *skb_head; ++ struct defsched_priv *def_p = defsched_get_priv(tp); ++ ++ meta_sk = mptcp_meta_sk(sk); ++ skb_head = tcp_rtx_queue_head(meta_sk); ++ ++ if (!skb_head) ++ return NULL; ++ ++ /* If penalization is optional (coming from mptcp_next_segment() and ++ * We are not send-buffer-limited we do not penalize. The retransmission ++ * is just an optimization to fix the idle-time due to the delay before ++ * we wake up the application. ++ */ ++ if (!penal && sk_stream_memory_free(meta_sk)) ++ goto retrans; ++ ++ /* Only penalize again after an RTT has elapsed */ ++ if (tcp_jiffies32 - def_p->last_rbuf_opti < usecs_to_jiffies(tp->srtt_us >> 3)) ++ goto retrans; ++ ++ /* Half the cwnd of the slow flows */ ++ mptcp_for_each_sub(tp->mpcb, mptcp) { ++ struct tcp_sock *tp_it = mptcp->tp; ++ ++ if (tp_it != tp && ++ TCP_SKB_CB(skb_head)->path_mask & mptcp_pi_to_flag(tp_it->mptcp->path_index)) { ++ if (tp->srtt_us < tp_it->srtt_us && inet_csk((struct sock *)tp_it)->icsk_ca_state == TCP_CA_Open) { ++ u32 prior_cwnd = tp_it->snd_cwnd; ++ ++ tp_it->snd_cwnd = max(tp_it->snd_cwnd >> 1U, 1U); ++ ++ /* If in slow start, do not reduce the ssthresh */ ++ if (prior_cwnd >= tp_it->snd_ssthresh) ++ tp_it->snd_ssthresh = max(tp_it->snd_ssthresh >> 1U, 2U); ++ ++ def_p->last_rbuf_opti = tcp_jiffies32; ++ } ++ } ++ } ++ ++retrans: ++ ++ /* Segment not yet injected into this path? Take it!!! */ ++ if (!(TCP_SKB_CB(skb_head)->path_mask & mptcp_pi_to_flag(tp->mptcp->path_index))) { ++ bool do_retrans = false; ++ mptcp_for_each_sub(tp->mpcb, mptcp) { ++ struct tcp_sock *tp_it = mptcp->tp; ++ ++ if (tp_it != tp && ++ TCP_SKB_CB(skb_head)->path_mask & mptcp_pi_to_flag(tp_it->mptcp->path_index)) { ++ if (tp_it->snd_cwnd <= 4) { ++ do_retrans = true; ++ break; ++ } ++ ++ if (4 * tp->srtt_us >= tp_it->srtt_us) { ++ do_retrans = false; ++ break; ++ } else { ++ do_retrans = true; ++ } ++ } ++ } ++ ++ if (do_retrans && mptcp_is_available(sk, skb_head, false)) { ++ trace_mptcp_retransmit(sk, skb_head); ++ return skb_head; ++ } ++ } ++ return NULL; ++} ++ ++/* Returns the next segment to be sent from the mptcp meta-queue. 
++ * (chooses the reinject queue if any segment is waiting in it, otherwise, ++ * chooses the normal write queue). ++ * Sets *@reinject to 1 if the returned segment comes from the ++ * reinject queue. Sets it to 0 if it is the regular send-head of the meta-sk, ++ * and sets it to -1 if it is a meta-level retransmission to optimize the ++ * receive-buffer. ++ */ ++static struct sk_buff *__mptcp_next_segment(struct sock *meta_sk, int *reinject) ++{ ++ const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; ++ struct sk_buff *skb = NULL; ++ ++ *reinject = 0; ++ ++ /* If we are in fallback-mode, just take from the meta-send-queue */ ++ if (mpcb->infinite_mapping_snd || mpcb->send_infinite_mapping) ++ return tcp_send_head(meta_sk); ++ ++ skb = skb_peek(&mpcb->reinject_queue); ++ ++ if (skb) { ++ *reinject = 1; ++ } else { ++ skb = tcp_send_head(meta_sk); ++ ++ if (!skb && meta_sk->sk_socket && ++ test_bit(SOCK_NOSPACE, &meta_sk->sk_socket->flags) && ++ sk_stream_wspace(meta_sk) < sk_stream_min_wspace(meta_sk)) { ++ struct sock *subsk; ++ ++ /* meta is send buffer limited */ ++ tcp_chrono_start(meta_sk, TCP_CHRONO_SNDBUF_LIMITED); ++ ++ subsk = mpcb->sched_ops->get_subflow(meta_sk, ++ NULL, false); ++ if (!subsk) ++ return NULL; ++ ++ skb = mptcp_rcv_buf_optimization(subsk, 0); ++ if (skb) ++ *reinject = -1; ++ else ++ tcp_chrono_start(subsk, ++ TCP_CHRONO_SNDBUF_LIMITED); ++ } ++ } ++ return skb; ++} ++ ++struct sk_buff *mptcp_next_segment(struct sock *meta_sk, ++ int *reinject, ++ struct sock **subsk, ++ unsigned int *limit) ++{ ++ struct sk_buff *skb = __mptcp_next_segment(meta_sk, reinject); ++ unsigned int mss_now; ++ u32 max_len, gso_max_segs, max_segs, max_tso_segs, window; ++ struct tcp_sock *subtp; ++ int queued; ++ ++ /* As we set it, we have to reset it as well. */ ++ *limit = 0; ++ ++ if (!skb) ++ return NULL; ++ ++ *subsk = tcp_sk(meta_sk)->mpcb->sched_ops->get_subflow(meta_sk, skb, false); ++ if (!*subsk) ++ return NULL; ++ ++ subtp = tcp_sk(*subsk); ++ mss_now = tcp_current_mss(*subsk); ++ ++ if (!*reinject && unlikely(!tcp_snd_wnd_test(tcp_sk(meta_sk), skb, mss_now))) { ++ /* an active flow is selected, but segment will not be sent due ++ * to no more space in send window ++ * this means the meta is receive window limited ++ * the subflow might also be, if we have nothing to reinject ++ */ ++ tcp_chrono_start(meta_sk, TCP_CHRONO_RWND_LIMITED); ++ skb = mptcp_rcv_buf_optimization(*subsk, 1); ++ if (skb) ++ *reinject = -1; ++ else ++ return NULL; ++ } ++ ++ if (!*reinject) { ++ /* this will stop any other chronos on the meta */ ++ tcp_chrono_start(meta_sk, TCP_CHRONO_BUSY); ++ } ++ ++ /* No splitting required, as we will only send one single segment */ ++ if (skb->len <= mss_now) ++ return skb; ++ ++ max_tso_segs = tcp_tso_segs(*subsk, tcp_current_mss(*subsk)); ++ queued = mptcp_subflow_queued(*subsk, max_tso_segs); ++ ++ /* this condition should already have been established in ++ * mptcp_is_temp_unavailable when selecting available flows ++ */ ++ WARN_ONCE(subtp->snd_cwnd <= queued, "Selected subflow no cwnd room"); ++ ++ gso_max_segs = (*subsk)->sk_gso_max_segs; ++ if (!gso_max_segs) /* No gso supported on the subflow's NIC */ ++ gso_max_segs = 1; ++ ++ max_segs = min_t(unsigned int, subtp->snd_cwnd - queued, gso_max_segs); ++ if (!max_segs) ++ return NULL; ++ ++ /* if there is room for a segment, schedule up to a complete TSO ++ * segment to avoid TSO splitting. Even if it is more than allowed by ++ * the congestion window. 
++ */ ++ max_segs = max_t(unsigned int, max_tso_segs, max_segs); ++ ++ max_len = min(mss_now * max_segs, skb->len); ++ ++ window = tcp_wnd_end(subtp) - subtp->write_seq; ++ ++ /* max_len now also respects the announced receive-window */ ++ max_len = min(max_len, window); ++ ++ *limit = max_len; ++ ++ return skb; ++} ++EXPORT_SYMBOL_GPL(mptcp_next_segment); ++ ++static void defsched_init(struct sock *sk) ++{ ++ struct defsched_priv *def_p = defsched_get_priv(tcp_sk(sk)); ++ ++ def_p->last_rbuf_opti = tcp_jiffies32; ++} ++ ++struct mptcp_sched_ops mptcp_sched_default = { ++ .get_subflow = get_available_subflow, ++ .next_segment = mptcp_next_segment, ++ .init = defsched_init, ++ .name = "default", ++ .owner = THIS_MODULE, ++}; ++ ++static struct mptcp_sched_ops *mptcp_sched_find(const char *name) ++{ ++ struct mptcp_sched_ops *e; ++ ++ list_for_each_entry_rcu(e, &mptcp_sched_list, list) { ++ if (strcmp(e->name, name) == 0) ++ return e; ++ } ++ ++ return NULL; ++} ++ ++int mptcp_register_scheduler(struct mptcp_sched_ops *sched) ++{ ++ int ret = 0; ++ ++ if (!sched->get_subflow || !sched->next_segment) ++ return -EINVAL; ++ ++ spin_lock(&mptcp_sched_list_lock); ++ if (mptcp_sched_find(sched->name)) { ++ pr_notice("%s already registered\n", sched->name); ++ ret = -EEXIST; ++ } else { ++ list_add_tail_rcu(&sched->list, &mptcp_sched_list); ++ pr_info("%s registered\n", sched->name); ++ } ++ spin_unlock(&mptcp_sched_list_lock); ++ ++ return ret; ++} ++EXPORT_SYMBOL_GPL(mptcp_register_scheduler); ++ ++void mptcp_unregister_scheduler(struct mptcp_sched_ops *sched) ++{ ++ spin_lock(&mptcp_sched_list_lock); ++ list_del_rcu(&sched->list); ++ spin_unlock(&mptcp_sched_list_lock); ++ ++ /* Wait for outstanding readers to complete before the ++ * module gets removed entirely. ++ * ++ * A try_module_get() should fail by now as our module is ++ * in "going" state since no refs are held anymore and ++ * module_exit() handler being called. 
++ */ ++ synchronize_rcu(); ++} ++EXPORT_SYMBOL_GPL(mptcp_unregister_scheduler); ++ ++void mptcp_get_default_scheduler(char *name) ++{ ++ struct mptcp_sched_ops *sched; ++ ++ BUG_ON(list_empty(&mptcp_sched_list)); ++ ++ rcu_read_lock(); ++ sched = list_entry(mptcp_sched_list.next, struct mptcp_sched_ops, list); ++ strncpy(name, sched->name, MPTCP_SCHED_NAME_MAX); ++ rcu_read_unlock(); ++} ++ ++int mptcp_set_default_scheduler(const char *name) ++{ ++ struct mptcp_sched_ops *sched; ++ int ret = -ENOENT; ++ ++ spin_lock(&mptcp_sched_list_lock); ++ sched = mptcp_sched_find(name); ++#ifdef CONFIG_MODULES ++ if (!sched && capable(CAP_NET_ADMIN)) { ++ spin_unlock(&mptcp_sched_list_lock); ++ ++ request_module("mptcp_%s", name); ++ spin_lock(&mptcp_sched_list_lock); ++ sched = mptcp_sched_find(name); ++ } ++#endif ++ ++ if (sched) { ++ list_move(&sched->list, &mptcp_sched_list); ++ ret = 0; ++ } else { ++ pr_info("%s is not available\n", name); ++ } ++ spin_unlock(&mptcp_sched_list_lock); ++ ++ return ret; ++} ++ ++/* Must be called with rcu lock held */ ++static struct mptcp_sched_ops *__mptcp_sched_find_autoload(const char *name) ++{ ++ struct mptcp_sched_ops *sched = mptcp_sched_find(name); ++#ifdef CONFIG_MODULES ++ if (!sched && capable(CAP_NET_ADMIN)) { ++ rcu_read_unlock(); ++ request_module("mptcp_%s", name); ++ rcu_read_lock(); ++ sched = mptcp_sched_find(name); ++ } ++#endif ++ return sched; ++} ++ ++void mptcp_init_scheduler(struct mptcp_cb *mpcb) ++{ ++ struct mptcp_sched_ops *sched; ++ struct sock *meta_sk = mpcb->meta_sk; ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk); ++ ++ rcu_read_lock(); ++ /* if scheduler was set using socket option */ ++ if (meta_tp->mptcp_sched_setsockopt) { ++ sched = __mptcp_sched_find_autoload(meta_tp->mptcp_sched_name); ++ if (sched && try_module_get(sched->owner)) { ++ mpcb->sched_ops = sched; ++ goto out; ++ } ++ } ++ ++ list_for_each_entry_rcu(sched, &mptcp_sched_list, list) { ++ if (try_module_get(sched->owner)) { ++ mpcb->sched_ops = sched; ++ break; ++ } ++ } ++out: ++ rcu_read_unlock(); ++} ++ ++/* Change scheduler for socket */ ++int mptcp_set_scheduler(struct sock *sk, const char *name) ++{ ++ struct mptcp_sched_ops *sched; ++ int err = 0; ++ ++ rcu_read_lock(); ++ sched = __mptcp_sched_find_autoload(name); ++ ++ if (!sched) { ++ err = -ENOENT; ++ } else if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { ++ err = -EPERM; ++ } else { ++ strcpy(tcp_sk(sk)->mptcp_sched_name, name); ++ tcp_sk(sk)->mptcp_sched_setsockopt = 1; ++ } ++ rcu_read_unlock(); ++ ++ return err; ++} ++ ++/* Manage refcounts on socket close. 
*/ ++void mptcp_cleanup_scheduler(struct mptcp_cb *mpcb) ++{ ++ module_put(mpcb->sched_ops->owner); ++} ++ ++/* Set default value from kernel configuration at bootup */ ++static int __init mptcp_scheduler_default(void) ++{ ++ BUILD_BUG_ON(sizeof(struct defsched_priv) > MPTCP_SCHED_SIZE); ++ ++ return mptcp_set_default_scheduler(CONFIG_DEFAULT_MPTCP_SCHED); ++} ++late_initcall(mptcp_scheduler_default); +diff --git a/net/mptcp/mptcp_wvegas.c b/net/mptcp/mptcp_wvegas.c +new file mode 100644 +index 000000000000..787ddaab98a2 +--- /dev/null ++++ b/net/mptcp/mptcp_wvegas.c +@@ -0,0 +1,271 @@ ++/* ++ * MPTCP implementation - WEIGHTED VEGAS ++ * ++ * Algorithm design: ++ * Yu Cao ++ * Mingwei Xu ++ * Xiaoming Fu ++ * ++ * Implementation: ++ * Yu Cao ++ * Enhuan Dong ++ * ++ * Ported to the official MPTCP-kernel: ++ * Christoph Paasch ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License ++ * as published by the Free Software Foundation; either version ++ * 2 of the License, or (at your option) any later version. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++static int initial_alpha = 2; ++static int total_alpha = 10; ++static int gamma = 1; ++ ++module_param(initial_alpha, int, 0644); ++MODULE_PARM_DESC(initial_alpha, "initial alpha for all subflows"); ++module_param(total_alpha, int, 0644); ++MODULE_PARM_DESC(total_alpha, "total alpha for all subflows"); ++module_param(gamma, int, 0644); ++MODULE_PARM_DESC(gamma, "limit on increase (scale by 2)"); ++ ++#define MPTCP_WVEGAS_SCALE 16 ++ ++/* wVegas variables */ ++struct wvegas { ++ u32 beg_snd_nxt; /* right edge during last RTT */ ++ u8 doing_wvegas_now;/* if true, do wvegas for this RTT */ ++ ++ u16 cnt_rtt; /* # of RTTs measured within last RTT */ ++ u32 sampled_rtt; /* cumulative RTTs measured within last RTT (in usec) */ ++ u32 base_rtt; /* the min of all wVegas RTT measurements seen (in usec) */ ++ ++ u64 instant_rate; /* cwnd / srtt_us, unit: pkts/us * 2^16 */ ++ u64 weight; /* the ratio of subflow's rate to the total rate, * 2^16 */ ++ int alpha; /* alpha for each subflows */ ++ ++ u32 queue_delay; /* queue delay*/ ++}; ++ ++ ++static inline u64 mptcp_wvegas_scale(u32 val, int scale) ++{ ++ return (u64) val << scale; ++} ++ ++static void wvegas_enable(const struct sock *sk) ++{ ++ const struct tcp_sock *tp = tcp_sk(sk); ++ struct wvegas *wvegas = inet_csk_ca(sk); ++ ++ wvegas->doing_wvegas_now = 1; ++ ++ wvegas->beg_snd_nxt = tp->snd_nxt; ++ ++ wvegas->cnt_rtt = 0; ++ wvegas->sampled_rtt = 0; ++ ++ wvegas->instant_rate = 0; ++ wvegas->alpha = initial_alpha; ++ wvegas->weight = mptcp_wvegas_scale(1, MPTCP_WVEGAS_SCALE); ++ ++ wvegas->queue_delay = 0; ++} ++ ++static inline void wvegas_disable(const struct sock *sk) ++{ ++ struct wvegas *wvegas = inet_csk_ca(sk); ++ ++ wvegas->doing_wvegas_now = 0; ++} ++ ++static void mptcp_wvegas_init(struct sock *sk) ++{ ++ struct wvegas *wvegas = inet_csk_ca(sk); ++ ++ wvegas->base_rtt = 0x7fffffff; ++ wvegas_enable(sk); ++} ++ ++static inline u64 mptcp_wvegas_rate(u32 cwnd, u32 rtt_us) ++{ ++ return div_u64(mptcp_wvegas_scale(cwnd, MPTCP_WVEGAS_SCALE), rtt_us); ++} ++ ++static void mptcp_wvegas_pkts_acked(struct sock *sk, ++ const struct ack_sample *sample) ++{ ++ struct wvegas *wvegas = inet_csk_ca(sk); ++ u32 vrtt; ++ ++ if (sample->rtt_us < 0) ++ return; ++ ++ vrtt = sample->rtt_us + 1; ++ ++ if (vrtt < wvegas->base_rtt) ++ wvegas->base_rtt = vrtt; ++ ++ wvegas->sampled_rtt += vrtt; ++ 
wvegas->cnt_rtt++; ++} ++ ++static void mptcp_wvegas_state(struct sock *sk, u8 ca_state) ++{ ++ if (ca_state == TCP_CA_Open) ++ wvegas_enable(sk); ++ else ++ wvegas_disable(sk); ++} ++ ++static void mptcp_wvegas_cwnd_event(struct sock *sk, enum tcp_ca_event event) ++{ ++ if (event == CA_EVENT_CWND_RESTART) { ++ mptcp_wvegas_init(sk); ++ } else if (event == CA_EVENT_LOSS) { ++ struct wvegas *wvegas = inet_csk_ca(sk); ++ wvegas->instant_rate = 0; ++ } ++} ++ ++static inline u32 mptcp_wvegas_ssthresh(const struct tcp_sock *tp) ++{ ++ return min(tp->snd_ssthresh, tp->snd_cwnd); ++} ++ ++static u64 mptcp_wvegas_weight(const struct mptcp_cb *mpcb, const struct sock *sk) ++{ ++ u64 total_rate = 0; ++ const struct wvegas *wvegas = inet_csk_ca(sk); ++ struct mptcp_tcp_sock *mptcp; ++ ++ if (!mpcb) ++ return wvegas->weight; ++ ++ ++ mptcp_for_each_sub(mpcb, mptcp) { ++ struct sock *sub_sk = mptcp_to_sock(mptcp); ++ struct wvegas *sub_wvegas = inet_csk_ca(sub_sk); ++ ++ /* sampled_rtt is initialized by 0 */ ++ if (mptcp_sk_can_send(sub_sk) && (sub_wvegas->sampled_rtt > 0)) ++ total_rate += sub_wvegas->instant_rate; ++ } ++ ++ if (total_rate && wvegas->instant_rate) ++ return div64_u64(mptcp_wvegas_scale(wvegas->instant_rate, MPTCP_WVEGAS_SCALE), total_rate); ++ else ++ return wvegas->weight; ++} ++ ++static void mptcp_wvegas_cong_avoid(struct sock *sk, u32 ack, u32 acked) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct wvegas *wvegas = inet_csk_ca(sk); ++ ++ if (!wvegas->doing_wvegas_now) { ++ tcp_reno_cong_avoid(sk, ack, acked); ++ return; ++ } ++ ++ if (after(ack, wvegas->beg_snd_nxt)) { ++ wvegas->beg_snd_nxt = tp->snd_nxt; ++ ++ if (wvegas->cnt_rtt <= 2) { ++ tcp_reno_cong_avoid(sk, ack, acked); ++ } else { ++ u32 rtt, diff, q_delay; ++ u64 target_cwnd; ++ ++ rtt = wvegas->sampled_rtt / wvegas->cnt_rtt; ++ target_cwnd = div_u64(((u64)tp->snd_cwnd * wvegas->base_rtt), rtt); ++ ++ diff = div_u64((u64)tp->snd_cwnd * (rtt - wvegas->base_rtt), rtt); ++ ++ if (diff > gamma && tcp_in_slow_start(tp)) { ++ tp->snd_cwnd = min(tp->snd_cwnd, (u32)target_cwnd+1); ++ tp->snd_ssthresh = mptcp_wvegas_ssthresh(tp); ++ ++ } else if (tcp_in_slow_start(tp)) { ++ tcp_slow_start(tp, acked); ++ } else { ++ if (diff >= wvegas->alpha) { ++ wvegas->instant_rate = mptcp_wvegas_rate(tp->snd_cwnd, rtt); ++ wvegas->weight = mptcp_wvegas_weight(tp->mpcb, sk); ++ wvegas->alpha = max(2U, (u32)((wvegas->weight * total_alpha) >> MPTCP_WVEGAS_SCALE)); ++ } ++ if (diff > wvegas->alpha) { ++ tp->snd_cwnd--; ++ tp->snd_ssthresh = mptcp_wvegas_ssthresh(tp); ++ } else if (diff < wvegas->alpha) { ++ tp->snd_cwnd++; ++ } ++ ++ /* Try to drain link queue if needed*/ ++ q_delay = rtt - wvegas->base_rtt; ++ if ((wvegas->queue_delay == 0) || (wvegas->queue_delay > q_delay)) ++ wvegas->queue_delay = q_delay; ++ ++ if (q_delay >= 2 * wvegas->queue_delay) { ++ u32 backoff_factor = div_u64(mptcp_wvegas_scale(wvegas->base_rtt, MPTCP_WVEGAS_SCALE), 2 * rtt); ++ tp->snd_cwnd = ((u64)tp->snd_cwnd * backoff_factor) >> MPTCP_WVEGAS_SCALE; ++ wvegas->queue_delay = 0; ++ } ++ } ++ ++ if (tp->snd_cwnd < 2) ++ tp->snd_cwnd = 2; ++ else if (tp->snd_cwnd > tp->snd_cwnd_clamp) ++ tp->snd_cwnd = tp->snd_cwnd_clamp; ++ ++ tp->snd_ssthresh = tcp_current_ssthresh(sk); ++ } ++ ++ wvegas->cnt_rtt = 0; ++ wvegas->sampled_rtt = 0; ++ } ++ /* Use normal slow start */ ++ else if (tcp_in_slow_start(tp)) ++ tcp_slow_start(tp, acked); ++} ++ ++ ++static struct tcp_congestion_ops mptcp_wvegas __read_mostly = { ++ .init = mptcp_wvegas_init, ++ .ssthresh = 
tcp_reno_ssthresh, ++ .cong_avoid = mptcp_wvegas_cong_avoid, ++ .undo_cwnd = tcp_reno_undo_cwnd, ++ .pkts_acked = mptcp_wvegas_pkts_acked, ++ .set_state = mptcp_wvegas_state, ++ .cwnd_event = mptcp_wvegas_cwnd_event, ++ ++ .owner = THIS_MODULE, ++ .name = "wvegas", ++}; ++ ++static int __init mptcp_wvegas_register(void) ++{ ++ BUILD_BUG_ON(sizeof(struct wvegas) > ICSK_CA_PRIV_SIZE); ++ tcp_register_congestion_control(&mptcp_wvegas); ++ return 0; ++} ++ ++static void __exit mptcp_wvegas_unregister(void) ++{ ++ tcp_unregister_congestion_control(&mptcp_wvegas); ++} ++ ++module_init(mptcp_wvegas_register); ++module_exit(mptcp_wvegas_unregister); ++ ++MODULE_AUTHOR("Yu Cao, Enhuan Dong"); ++MODULE_LICENSE("GPL"); ++MODULE_DESCRIPTION("MPTCP wVegas"); ++MODULE_VERSION("0.1"); +diff --git a/net/socket.c b/net/socket.c +index 94358566c9d1..a26eeeda2b4d 100644 +--- a/net/socket.c ++++ b/net/socket.c +@@ -91,6 +91,7 @@ + #include + + #include ++#include + #include + #include + +@@ -1339,6 +1340,7 @@ int __sock_create(struct net *net, int family, int type, int protocol, + int err; + struct socket *sock; + const struct net_proto_family *pf; ++ int old_protocol = protocol; + + /* + * Check protocol is in range +@@ -1359,6 +1361,9 @@ int __sock_create(struct net *net, int family, int type, int protocol, + family = PF_PACKET; + } + ++ if (old_protocol == IPPROTO_MPTCP) ++ protocol = IPPROTO_TCP; ++ + err = security_socket_create(family, type, protocol, kern); + if (err) + return err; +@@ -1408,6 +1413,10 @@ int __sock_create(struct net *net, int family, int type, int protocol, + if (err < 0) + goto out_module_put; + ++ if (sysctl_mptcp_enabled && old_protocol == IPPROTO_MPTCP && ++ type == SOCK_STREAM && (family == AF_INET || family == AF_INET6)) ++ mptcp_enable_sock(sock->sk); ++ + /* + * Now to bump the refcnt of the [loadable] module that owns this + * socket at sock_release time we decrement its refcnt. +diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h +index 63038eb23560..7150eb62db86 100644 +--- a/tools/include/uapi/linux/bpf.h ++++ b/tools/include/uapi/linux/bpf.h +@@ -3438,6 +3438,7 @@ enum { + BPF_TCP_LISTEN, + BPF_TCP_CLOSING, /* Now a valid state */ + BPF_TCP_NEW_SYN_RECV, ++ BPF_TCP_RST_WAIT, + + BPF_TCP_MAX_STATES /* Leave at the end! */ + }; diff --git a/root/target/linux/generic/hack-5.4/700-swconfig_switch_drivers.patch b/root/target/linux/generic/hack-5.4/700-swconfig_switch_drivers.patch deleted file mode 100755 index f30ad81e..00000000 --- a/root/target/linux/generic/hack-5.4/700-swconfig_switch_drivers.patch +++ /dev/null @@ -1,135 +0,0 @@ -From 36e516290611e613aa92996cb4339561452695b4 Mon Sep 17 00:00:00 2001 -From: Felix Fietkau -Date: Fri, 7 Jul 2017 17:24:23 +0200 -Subject: net: swconfig: adds openwrt switch layer - -Signed-off-by: Felix Fietkau ---- - drivers/net/phy/Kconfig | 83 +++++++++++++++++++++++++++++++++++++++++++++++ - drivers/net/phy/Makefile | 15 +++++++++ - include/uapi/linux/Kbuild | 1 + - 3 files changed, 99 insertions(+) - ---- a/drivers/net/phy/Kconfig -+++ b/drivers/net/phy/Kconfig -@@ -250,6 +250,85 @@ config LED_TRIGGER_PHY - for any speed known to the PHY. - - -+comment "Switch configuration API + drivers" -+ -+config SWCONFIG -+ tristate "Switch configuration API" -+ ---help--- -+ Switch configuration API using netlink. This allows -+ you to configure the VLAN features of certain switches. 
-+ -+config SWCONFIG_LEDS -+ bool "Switch LED trigger support" -+ depends on (SWCONFIG && LEDS_TRIGGERS) -+ -+config ADM6996_PHY -+ tristate "Driver for ADM6996 switches" -+ select SWCONFIG -+ ---help--- -+ Currently supports the ADM6996FC and ADM6996M switches. -+ Support for FC is very limited. -+ -+config AR8216_PHY -+ tristate "Driver for Atheros AR8216 switches" -+ select ETHERNET_PACKET_MANGLE -+ select SWCONFIG -+ -+config AR8216_PHY_LEDS -+ bool "Atheros AR8216 switch LED support" -+ depends on (AR8216_PHY && LEDS_CLASS) -+ -+source "drivers/net/phy/b53/Kconfig" -+ -+config IP17XX_PHY -+ tristate "Driver for IC+ IP17xx switches" -+ select SWCONFIG -+ -+config MVSWITCH_PHY -+ tristate "Driver for Marvell 88E6060 switches" -+ select ETHERNET_PACKET_MANGLE -+ -+config PSB6970_PHY -+ tristate "Lantiq XWAY Tantos (PSB6970) Ethernet switch" -+ select SWCONFIG -+ select ETHERNET_PACKET_MANGLE -+ -+config RTL8306_PHY -+ tristate "Driver for Realtek RTL8306S switches" -+ select SWCONFIG -+ -+config RTL8366_SMI -+ tristate "Driver for the RTL8366 SMI interface" -+ depends on GPIOLIB -+ ---help--- -+ This module implements the SMI interface protocol which is used -+ by some RTL8366 ethernet switch devices via the generic GPIO API. -+ -+if RTL8366_SMI -+ -+config RTL8366_SMI_DEBUG_FS -+ bool "RTL8366 SMI interface debugfs support" -+ depends on DEBUG_FS -+ default n -+ -+config RTL8366S_PHY -+ tristate "Driver for the Realtek RTL8366S switch" -+ select SWCONFIG -+ -+config RTL8366RB_PHY -+ tristate "Driver for the Realtek RTL8366RB switch" -+ select SWCONFIG -+ -+config RTL8367_PHY -+ tristate "Driver for the Realtek RTL8367R/M switches" -+ select SWCONFIG -+ -+config RTL8367B_PHY -+ tristate "Driver fot the Realtek RTL8367R-VB switch" -+ select SWCONFIG -+ -+endif # RTL8366_SMI -+ - comment "MII PHY device drivers" - - config SFP ---- a/drivers/net/phy/Makefile -+++ b/drivers/net/phy/Makefile -@@ -22,6 +22,20 @@ libphy-$(CONFIG_LED_TRIGGER_PHY) += phy_ - obj-$(CONFIG_PHYLINK) += phylink.o - obj-$(CONFIG_PHYLIB) += libphy.o - -+obj-$(CONFIG_SWCONFIG) += swconfig.o -+obj-$(CONFIG_ADM6996_PHY) += adm6996.o -+obj-$(CONFIG_AR8216_PHY) += ar8216.o ar8327.o -+obj-$(CONFIG_SWCONFIG_B53) += b53/ -+obj-$(CONFIG_IP17XX_PHY) += ip17xx.o -+obj-$(CONFIG_MVSWITCH_PHY) += mvswitch.o -+obj-$(CONFIG_PSB6970_PHY) += psb6970.o -+obj-$(CONFIG_RTL8306_PHY) += rtl8306.o -+obj-$(CONFIG_RTL8366_SMI) += rtl8366_smi.o -+obj-$(CONFIG_RTL8366S_PHY) += rtl8366s.o -+obj-$(CONFIG_RTL8366RB_PHY) += rtl8366rb.o -+obj-$(CONFIG_RTL8367_PHY) += rtl8367.o -+obj-$(CONFIG_RTL8367B_PHY) += rtl8367b.o -+ - obj-$(CONFIG_MDIO_ASPEED) += mdio-aspeed.o - obj-$(CONFIG_MDIO_BCM_IPROC) += mdio-bcm-iproc.o - obj-$(CONFIG_MDIO_BCM_UNIMAC) += mdio-bcm-unimac.o ---- a/include/linux/platform_data/b53.h -+++ b/include/linux/platform_data/b53.h -@@ -29,6 +29,9 @@ struct b53_platform_data { - u32 chip_id; - u16 enabled_ports; - -+ /* allow to specify an ethX alias */ -+ const char *alias; -+ - /* only used by MMAP'd driver */ - unsigned big_endian:1; - void __iomem *regs; diff --git a/root/target/linux/generic/hack-5.4/703-add_vsc8504_support.patch b/root/target/linux/generic/hack-5.4/703-add_vsc8504_support.patch deleted file mode 100755 index afb6ca6c..00000000 --- a/root/target/linux/generic/hack-5.4/703-add_vsc8504_support.patch +++ /dev/null @@ -1,57 +0,0 @@ -From: Roman Kuzmitskii -Date: Thu, 05 Nov 2020 02:00:00 +0000 -Subject: [PATCH] net: phy: vitesse: add vsc8504 support - -This patch adds support for vsc8504 phy. 
-That phy is changed owner: - vitesse -> microsemi -> microchip -So is its driver in kernel was changed and rewritten. - -there is no need to upstream this patch. -this vsc8504 is supported by newer kernels out of box. -support could be enabled by CONFIG_MICROSEMI_PHY. - -Tested-by: Johannes Kimmel -Signed-off-by: Roman Kuzmitskii ---- a/drivers/net/phy/vitesse.c -+++ b/drivers/net/phy/vitesse.c -@@ -61,6 +61,7 @@ - - #define PHY_ID_VSC8234 0x000fc620 - #define PHY_ID_VSC8244 0x000fc6c0 -+#define PHY_ID_VSC8504 0x000704c2 - #define PHY_ID_VSC8572 0x000704d0 - #define PHY_ID_VSC8601 0x00070420 - #define PHY_ID_VSC7385 0x00070450 -@@ -292,6 +293,7 @@ static int vsc82xx_config_intr(struct ph - err = phy_write(phydev, MII_VSC8244_IMASK, - (phydev->drv->phy_id == PHY_ID_VSC8234 || - phydev->drv->phy_id == PHY_ID_VSC8244 || -+ phydev->drv->phy_id == PHY_ID_VSC8504 || - phydev->drv->phy_id == PHY_ID_VSC8572 || - phydev->drv->phy_id == PHY_ID_VSC8601) ? - MII_VSC8244_IMASK_MASK : -@@ -402,6 +404,15 @@ static struct phy_driver vsc82xx_driver[ - .ack_interrupt = &vsc824x_ack_interrupt, - .config_intr = &vsc82xx_config_intr, - }, { -+ .phy_id = PHY_ID_VSC8504, -+ .name = "Vitesse VSC8504", -+ .phy_id_mask = 0x000ffff0, -+ /* PHY_GBIT_FEATURES */ -+ .config_init = &vsc824x_config_init, -+ .config_aneg = &vsc82x4_config_aneg, -+ .ack_interrupt = &vsc824x_ack_interrupt, -+ .config_intr = &vsc82xx_config_intr, -+}, { - .phy_id = PHY_ID_VSC8572, - .name = "Vitesse VSC8572", - .phy_id_mask = 0x000ffff0, -@@ -488,6 +499,7 @@ module_phy_driver(vsc82xx_driver); - static struct mdio_device_id __maybe_unused vitesse_tbl[] = { - { PHY_ID_VSC8234, 0x000ffff0 }, - { PHY_ID_VSC8244, 0x000fffc0 }, -+ { PHY_ID_VSC8504, 0x000ffff0 }, - { PHY_ID_VSC8572, 0x000ffff0 }, - { PHY_ID_VSC7385, 0x000ffff0 }, - { PHY_ID_VSC7388, 0x000ffff0 }, diff --git a/root/target/linux/generic/hack-5.4/710-net-dsa-mv88e6xxx-default-VID-1.patch b/root/target/linux/generic/hack-5.4/710-net-dsa-mv88e6xxx-default-VID-1.patch deleted file mode 100755 index 5dc5ac68..00000000 --- a/root/target/linux/generic/hack-5.4/710-net-dsa-mv88e6xxx-default-VID-1.patch +++ /dev/null @@ -1,18 +0,0 @@ ---- a/drivers/net/dsa/mv88e6xxx/chip.c -+++ b/drivers/net/dsa/mv88e6xxx/chip.c -@@ -1930,6 +1930,7 @@ static int mv88e6xxx_port_fdb_add(struct - struct mv88e6xxx_chip *chip = ds->priv; - int err; - -+ vid = vid ? : 1; - mv88e6xxx_reg_lock(chip); - err = mv88e6xxx_port_db_load_purge(chip, port, addr, vid, - MV88E6XXX_G1_ATU_DATA_STATE_UC_STATIC); -@@ -1944,6 +1945,7 @@ static int mv88e6xxx_port_fdb_del(struct - struct mv88e6xxx_chip *chip = ds->priv; - int err; - -+ vid = vid ? 
: 1; - mv88e6xxx_reg_lock(chip); - err = mv88e6xxx_port_db_load_purge(chip, port, addr, vid, 0); - mv88e6xxx_reg_unlock(chip); diff --git a/root/target/linux/generic/hack-5.4/711-net-dsa-mv88e6xxx-disable-ATU-violation.patch b/root/target/linux/generic/hack-5.4/711-net-dsa-mv88e6xxx-disable-ATU-violation.patch deleted file mode 100755 index 1da388c8..00000000 --- a/root/target/linux/generic/hack-5.4/711-net-dsa-mv88e6xxx-disable-ATU-violation.patch +++ /dev/null @@ -1,12 +0,0 @@ ---- a/drivers/net/dsa/mv88e6xxx/chip.c -+++ b/drivers/net/dsa/mv88e6xxx/chip.c -@@ -2492,6 +2492,9 @@ static int mv88e6xxx_setup_port(struct m - if (dsa_is_cpu_port(ds, port)) - reg = 0; - -+ /* Disable ATU member violation interrupt */ -+ reg |= MV88E6XXX_PORT_ASSOC_VECTOR_IGNORE_WRONG; -+ - err = mv88e6xxx_port_write(chip, port, MV88E6XXX_PORT_ASSOC_VECTOR, - reg); - if (err) diff --git a/root/target/linux/generic/hack-5.4/721-phy_packets.patch b/root/target/linux/generic/hack-5.4/721-phy_packets.patch deleted file mode 100755 index 89ff8ea4..00000000 --- a/root/target/linux/generic/hack-5.4/721-phy_packets.patch +++ /dev/null @@ -1,176 +0,0 @@ -From ffe387740bbe88dd88bbe04d6375902708003d6e Mon Sep 17 00:00:00 2001 -From: Felix Fietkau -Date: Fri, 7 Jul 2017 17:25:00 +0200 -Subject: net: add packet mangeling patch - -Signed-off-by: Felix Fietkau ---- - include/linux/netdevice.h | 11 +++++++++++ - include/linux/skbuff.h | 14 ++++---------- - net/Kconfig | 6 ++++++ - net/core/dev.c | 18 ++++++++++++++---- - net/core/skbuff.c | 17 +++++++++++++++++ - net/ethernet/eth.c | 6 ++++++ - 6 files changed, 58 insertions(+), 14 deletions(-) - ---- a/include/linux/netdevice.h -+++ b/include/linux/netdevice.h -@@ -1540,6 +1540,7 @@ enum netdev_priv_flags { - IFF_FAILOVER_SLAVE = 1<<28, - IFF_L3MDEV_RX_HANDLER = 1<<29, - IFF_LIVE_RENAME_OK = 1<<30, -+ IFF_NO_IP_ALIGN = 1<<31, - }; - - #define IFF_802_1Q_VLAN IFF_802_1Q_VLAN -@@ -1572,6 +1573,7 @@ enum netdev_priv_flags { - #define IFF_FAILOVER_SLAVE IFF_FAILOVER_SLAVE - #define IFF_L3MDEV_RX_HANDLER IFF_L3MDEV_RX_HANDLER - #define IFF_LIVE_RENAME_OK IFF_LIVE_RENAME_OK -+#define IFF_NO_IP_ALIGN IFF_NO_IP_ALIGN - - /* Specifies the type of the struct net_device::ml_priv pointer */ - enum netdev_ml_priv_type { -@@ -1882,6 +1884,11 @@ struct net_device { - const struct tlsdev_ops *tlsdev_ops; - #endif - -+#ifdef CONFIG_ETHERNET_PACKET_MANGLE -+ void (*eth_mangle_rx)(struct net_device *dev, struct sk_buff *skb); -+ struct sk_buff *(*eth_mangle_tx)(struct net_device *dev, struct sk_buff *skb); -+#endif -+ - const struct header_ops *header_ops; - - unsigned int flags; -@@ -1964,6 +1971,10 @@ struct net_device { - struct mpls_dev __rcu *mpls_ptr; - #endif - -+#ifdef CONFIG_ETHERNET_PACKET_MANGLE -+ void *phy_ptr; /* PHY device specific data */ -+#endif -+ - /* - * Cache lines mostly used on receive path (including eth_type_trans()) - */ ---- a/include/linux/skbuff.h -+++ b/include/linux/skbuff.h -@@ -2684,6 +2684,10 @@ static inline int pskb_trim(struct sk_bu - return (len < skb->len) ? 
__pskb_trim(skb, len) : 0; - } - -+extern struct sk_buff *__netdev_alloc_skb_ip_align(struct net_device *dev, -+ unsigned int length, gfp_t gfp); -+ -+ - /** - * pskb_trim_unique - remove end from a paged unique (not cloned) buffer - * @skb: buffer to alter -@@ -2815,16 +2819,6 @@ static inline struct sk_buff *dev_alloc_ - } - - --static inline struct sk_buff *__netdev_alloc_skb_ip_align(struct net_device *dev, -- unsigned int length, gfp_t gfp) --{ -- struct sk_buff *skb = __netdev_alloc_skb(dev, length + NET_IP_ALIGN, gfp); -- -- if (NET_IP_ALIGN && skb) -- skb_reserve(skb, NET_IP_ALIGN); -- return skb; --} -- - static inline struct sk_buff *netdev_alloc_skb_ip_align(struct net_device *dev, - unsigned int length) - { ---- a/net/Kconfig -+++ b/net/Kconfig -@@ -26,6 +26,12 @@ menuconfig NET - - if NET - -+config ETHERNET_PACKET_MANGLE -+ bool -+ help -+ This option can be selected by phy drivers that need to mangle -+ packets going in or out of an ethernet device. -+ - config WANT_COMPAT_NETLINK_MESSAGES - bool - help ---- a/net/core/dev.c -+++ b/net/core/dev.c -@@ -3221,10 +3221,20 @@ static int xmit_one(struct sk_buff *skb, - if (dev_nit_active(dev)) - dev_queue_xmit_nit(skb, dev); - -- len = skb->len; -- trace_net_dev_start_xmit(skb, dev); -- rc = netdev_start_xmit(skb, dev, txq, more); -- trace_net_dev_xmit(skb, rc, dev, len); -+#ifdef CONFIG_ETHERNET_PACKET_MANGLE -+ if (!dev->eth_mangle_tx || -+ (skb = dev->eth_mangle_tx(dev, skb)) != NULL) -+#else -+ if (1) -+#endif -+ { -+ len = skb->len; -+ trace_net_dev_start_xmit(skb, dev); -+ rc = netdev_start_xmit(skb, dev, txq, more); -+ trace_net_dev_xmit(skb, rc, dev, len); -+ } else { -+ rc = NETDEV_TX_OK; -+ } - - return rc; - } ---- a/net/core/skbuff.c -+++ b/net/core/skbuff.c -@@ -60,6 +60,7 @@ - #include - #include - #include -+#include - - #include - #include -@@ -549,6 +550,22 @@ skb_fail: - } - EXPORT_SYMBOL(__napi_alloc_skb); - -+struct sk_buff *__netdev_alloc_skb_ip_align(struct net_device *dev, -+ unsigned int length, gfp_t gfp) -+{ -+ struct sk_buff *skb = __netdev_alloc_skb(dev, length + NET_IP_ALIGN, gfp); -+ -+#ifdef CONFIG_ETHERNET_PACKET_MANGLE -+ if (dev && (dev->priv_flags & IFF_NO_IP_ALIGN)) -+ return skb; -+#endif -+ -+ if (NET_IP_ALIGN && skb) -+ skb_reserve(skb, NET_IP_ALIGN); -+ return skb; -+} -+EXPORT_SYMBOL(__netdev_alloc_skb_ip_align); -+ - void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off, - int size, unsigned int truesize) - { ---- a/net/ethernet/eth.c -+++ b/net/ethernet/eth.c -@@ -171,6 +171,12 @@ __be16 eth_type_trans(struct sk_buff *sk - const struct ethhdr *eth; - - skb->dev = dev; -+ -+#ifdef CONFIG_ETHERNET_PACKET_MANGLE -+ if (dev->eth_mangle_rx) -+ dev->eth_mangle_rx(dev, skb); -+#endif -+ - skb_reset_mac_header(skb); - - eth = (struct ethhdr *)skb->data; diff --git a/root/target/linux/generic/hack-5.4/760-net-usb-r8152-add-LED-configuration-from-OF.patch b/root/target/linux/generic/hack-5.4/760-net-usb-r8152-add-LED-configuration-from-OF.patch deleted file mode 100755 index a96661c9..00000000 --- a/root/target/linux/generic/hack-5.4/760-net-usb-r8152-add-LED-configuration-from-OF.patch +++ /dev/null @@ -1,74 +0,0 @@ -From 82985725e071f2a5735052f18e109a32aeac3a0b Mon Sep 17 00:00:00 2001 -From: David Bauer -Date: Sun, 26 Jul 2020 02:38:31 +0200 -Subject: [PATCH] net: usb: r8152: add LED configuration from OF - -This adds the ability to configure the LED configuration register using -OF. This way, the correct value for board specific LED configuration can -be determined. 
- -Signed-off-by: David Bauer ---- - drivers/net/usb/r8152.c | 23 +++++++++++++++++++++++ - 1 file changed, 23 insertions(+) - ---- a/drivers/net/usb/r8152.c -+++ b/drivers/net/usb/r8152.c -@@ -11,6 +11,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -4336,6 +4337,22 @@ static void rtl_tally_reset(struct r8152 - ocp_write_word(tp, MCU_TYPE_PLA, PLA_RSTTALLY, ocp_data); - } - -+static int r8152_led_configuration(struct r8152 *tp) -+{ -+ u32 led_data; -+ int ret; -+ -+ ret = of_property_read_u32(tp->udev->dev.of_node, "realtek,led-data", -+ &led_data); -+ -+ if (ret) -+ return ret; -+ -+ ocp_write_word(tp, MCU_TYPE_PLA, PLA_LEDSEL, led_data); -+ -+ return 0; -+} -+ - static void r8152b_init(struct r8152 *tp) - { - u32 ocp_data; -@@ -4377,6 +4394,8 @@ static void r8152b_init(struct r8152 *tp - ocp_data = ocp_read_word(tp, MCU_TYPE_USB, USB_USB_CTRL); - ocp_data &= ~(RX_AGG_DISABLE | RX_ZERO_EN); - ocp_write_word(tp, MCU_TYPE_USB, USB_USB_CTRL, ocp_data); -+ -+ r8152_led_configuration(tp); - } - - static void r8153_init(struct r8152 *tp) -@@ -4511,6 +4530,8 @@ static void r8153_init(struct r8152 *tp) - tp->coalesce = COALESCE_SLOW; - break; - } -+ -+ r8152_led_configuration(tp); - } - - static void r8153b_init(struct r8152 *tp) -@@ -4587,6 +4608,8 @@ static void r8153b_init(struct r8152 *tp - rtl_tally_reset(tp); - - tp->coalesce = 15000; /* 15 us */ -+ -+ r8152_led_configuration(tp); - } - - static int rtl8152_pre_reset(struct usb_interface *intf) diff --git a/root/target/linux/generic/hack-5.4/761-dt-bindings-net-add-RTL8152-binding-documentation.patch b/root/target/linux/generic/hack-5.4/761-dt-bindings-net-add-RTL8152-binding-documentation.patch deleted file mode 100755 index be262b99..00000000 --- a/root/target/linux/generic/hack-5.4/761-dt-bindings-net-add-RTL8152-binding-documentation.patch +++ /dev/null @@ -1,54 +0,0 @@ -From 3ee05f4aa64fc86af3be5bc176ba5808de9260a7 Mon Sep 17 00:00:00 2001 -From: David Bauer -Date: Sun, 26 Jul 2020 15:30:33 +0200 -Subject: [PATCH] dt-bindings: net: add RTL8152 binding documentation - -Add binding documentation for the Realtek RTL8152 / RTL8153 USB ethernet -adapters. - -Signed-off-by: David Bauer ---- - .../bindings/net/realtek,rtl8152.yaml | 36 +++++++++++++++++++ - 1 file changed, 36 insertions(+) - create mode 100644 Documentation/devicetree/bindings/net/realtek,rtl8152.yaml - ---- /dev/null -+++ b/Documentation/devicetree/bindings/net/realtek,rtl8152.yaml -@@ -0,0 +1,36 @@ -+# SPDX-License-Identifier: GPL-2.0 -+%YAML 1.2 -+--- -+$id: http://devicetree.org/schemas/net/realtek,rtl8152.yaml# -+$schema: http://devicetree.org/meta-schemas/core.yaml# -+ -+title: Realtek RTL8152/RTL8153 series USB ethernet -+ -+maintainers: -+ - David Bauer -+ -+properties: -+ compatible: -+ oneOf: -+ - items: -+ - enum: -+ - realtek,rtl8152 -+ - realtek,rtl8153 -+ -+ reg: -+ description: The device number on the USB bus -+ -+ realtek,led-data: -+ description: Value to be written to the LED configuration register. 
-+ -+required: -+ - compatible -+ - reg -+ -+examples: -+ - | -+ usb-eth@2 { -+ compatible = "realtek,rtl8153"; -+ reg = <2>; -+ realtek,led-data = <0x87>; -+ }; -\ No newline at end of file diff --git a/root/target/linux/generic/hack-5.4/773-bgmac-add-srab-switch.patch b/root/target/linux/generic/hack-5.4/773-bgmac-add-srab-switch.patch deleted file mode 100755 index 88109ac8..00000000 --- a/root/target/linux/generic/hack-5.4/773-bgmac-add-srab-switch.patch +++ /dev/null @@ -1,98 +0,0 @@ -From 3cb240533ab787899dc7f17aa7d6c5b4810e2e58 Mon Sep 17 00:00:00 2001 -From: Hauke Mehrtens -Date: Fri, 7 Jul 2017 17:26:01 +0200 -Subject: bcm53xx: bgmac: use srab switch driver - -use the srab switch driver on these SoCs. - -Signed-off-by: Hauke Mehrtens ---- - drivers/net/ethernet/broadcom/bgmac-bcma.c | 1 + - drivers/net/ethernet/broadcom/bgmac.c | 24 ++++++++++++++++++++++++ - drivers/net/ethernet/broadcom/bgmac.h | 4 ++++ - 3 files changed, 29 insertions(+) - ---- a/drivers/net/ethernet/broadcom/bgmac-bcma.c -+++ b/drivers/net/ethernet/broadcom/bgmac-bcma.c -@@ -266,6 +266,7 @@ static int bgmac_probe(struct bcma_devic - bgmac->feature_flags |= BGMAC_FEAT_CLKCTLST; - bgmac->feature_flags |= BGMAC_FEAT_NO_RESET; - bgmac->feature_flags |= BGMAC_FEAT_FORCE_SPEED_2500; -+ bgmac->feature_flags |= BGMAC_FEAT_SRAB; - break; - default: - bgmac->feature_flags |= BGMAC_FEAT_CLKCTLST; ---- a/drivers/net/ethernet/broadcom/bgmac.c -+++ b/drivers/net/ethernet/broadcom/bgmac.c -@@ -12,6 +12,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -1407,6 +1408,17 @@ static const struct ethtool_ops bgmac_et - .set_link_ksettings = phy_ethtool_set_link_ksettings, - }; - -+static struct b53_platform_data bgmac_b53_pdata = { -+}; -+ -+static struct platform_device bgmac_b53_dev = { -+ .name = "b53-srab-switch", -+ .id = -1, -+ .dev = { -+ .platform_data = &bgmac_b53_pdata, -+ }, -+}; -+ - /************************************************** - * MII - **************************************************/ -@@ -1538,6 +1550,14 @@ int bgmac_enet_probe(struct bgmac *bgmac - net_dev->hw_features = net_dev->features; - net_dev->vlan_features = net_dev->features; - -+ if ((bgmac->feature_flags & BGMAC_FEAT_SRAB) && !bgmac_b53_pdata.regs) { -+ bgmac_b53_pdata.regs = ioremap_nocache(0x18007000, 0x1000); -+ -+ err = platform_device_register(&bgmac_b53_dev); -+ if (!err) -+ bgmac->b53_device = &bgmac_b53_dev; -+ } -+ - err = register_netdev(bgmac->net_dev); - if (err) { - dev_err(bgmac->dev, "Cannot register net device\n"); -@@ -1560,6 +1580,10 @@ EXPORT_SYMBOL_GPL(bgmac_enet_probe); - - void bgmac_enet_remove(struct bgmac *bgmac) - { -+ if (bgmac->b53_device) -+ platform_device_unregister(&bgmac_b53_dev); -+ bgmac->b53_device = NULL; -+ - unregister_netdev(bgmac->net_dev); - phy_disconnect(bgmac->net_dev->phydev); - netif_napi_del(&bgmac->napi); ---- a/drivers/net/ethernet/broadcom/bgmac.h -+++ b/drivers/net/ethernet/broadcom/bgmac.h -@@ -427,6 +427,7 @@ - #define BGMAC_FEAT_CC4_IF_SW_TYPE_RGMII BIT(18) - #define BGMAC_FEAT_CC7_IF_TYPE_RGMII BIT(19) - #define BGMAC_FEAT_IDM_MASK BIT(20) -+#define BGMAC_FEAT_SRAB BIT(21) - - struct bgmac_slot_info { - union { -@@ -532,6 +533,9 @@ struct bgmac { - void (*cmn_maskset32)(struct bgmac *bgmac, u16 offset, u32 mask, - u32 set); - int (*phy_connect)(struct bgmac *bgmac); -+ -+ /* platform device for associated switch */ -+ struct platform_device *b53_device; - }; - - struct bgmac *bgmac_alloc(struct device *dev); diff --git 
a/root/target/linux/generic/hack-5.4/901-debloat_sock_diag.patch b/root/target/linux/generic/hack-5.4/901-debloat_sock_diag.patch deleted file mode 100755 index 0abb6726..00000000 --- a/root/target/linux/generic/hack-5.4/901-debloat_sock_diag.patch +++ /dev/null @@ -1,145 +0,0 @@ -From 3b6115d6b57a263bdc8c9b1df273bd4a7955eead Mon Sep 17 00:00:00 2001 -From: Felix Fietkau -Date: Sat, 8 Jul 2017 08:16:31 +0200 -Subject: debloat: add some debloat patches, strip down procfs and make O_DIRECT support optional, saves ~15K after lzma on MIPS - -Signed-off-by: Felix Fietkau ---- - net/Kconfig | 3 +++ - net/core/Makefile | 3 ++- - net/core/sock.c | 2 ++ - net/ipv4/Kconfig | 1 + - net/netlink/Kconfig | 1 + - net/packet/Kconfig | 1 + - net/unix/Kconfig | 1 + - 7 files changed, 11 insertions(+), 1 deletion(-) - ---- a/net/Kconfig -+++ b/net/Kconfig -@@ -103,6 +103,9 @@ source "net/netlabel/Kconfig" - - endif # if INET - -+config SOCK_DIAG -+ bool -+ - config NETWORK_SECMARK - bool "Security Marking" - help ---- a/net/core/Makefile -+++ b/net/core/Makefile -@@ -10,9 +10,10 @@ obj-$(CONFIG_SYSCTL) += sysctl_net_core. - - obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \ - neighbour.o rtnetlink.o utils.o link_watch.o filter.o \ -- sock_diag.o dev_ioctl.o tso.o sock_reuseport.o \ -+ dev_ioctl.o tso.o sock_reuseport.o \ - fib_notifier.o xdp.o flow_offload.o - -+obj-$(CONFIG_SOCK_DIAG) += sock_diag.o - obj-y += net-sysfs.o - obj-$(CONFIG_PAGE_POOL) += page_pool.o - obj-$(CONFIG_PROC_FS) += net-procfs.o ---- a/net/core/sock.c -+++ b/net/core/sock.c -@@ -140,6 +140,7 @@ - - static DEFINE_MUTEX(proto_list_mutex); - static LIST_HEAD(proto_list); -+static atomic64_t cookie_gen; - - static void sock_inuse_add(struct net *net, int val); - -@@ -539,6 +540,18 @@ discard_and_relse: - } - EXPORT_SYMBOL(__sk_receive_skb); - -+u64 sock_gen_cookie(struct sock *sk) -+{ -+ while (1) { -+ u64 res = atomic64_read(&sk->sk_cookie); -+ -+ if (res) -+ return res; -+ res = atomic64_inc_return(&cookie_gen); -+ atomic64_cmpxchg(&sk->sk_cookie, 0, res); -+ } -+} -+ - struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie) - { - struct dst_entry *dst = __sk_dst_get(sk); -@@ -1760,9 +1773,11 @@ static void __sk_free(struct sock *sk) - if (likely(sk->sk_net_refcnt)) - sock_inuse_add(sock_net(sk), -1); - -+#ifdef CONFIG_SOCK_DIAG - if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk))) - sock_diag_broadcast_destroy(sk); - else -+#endif - sk_destruct(sk); - } - ---- a/net/core/sock_diag.c -+++ b/net/core/sock_diag.c -@@ -19,19 +19,6 @@ static const struct sock_diag_handler *s - static int (*inet_rcv_compat)(struct sk_buff *skb, struct nlmsghdr *nlh); - static DEFINE_MUTEX(sock_diag_table_mutex); - static struct workqueue_struct *broadcast_wq; --static atomic64_t cookie_gen; -- --u64 sock_gen_cookie(struct sock *sk) --{ -- while (1) { -- u64 res = atomic64_read(&sk->sk_cookie); -- -- if (res) -- return res; -- res = atomic64_inc_return(&cookie_gen); -- atomic64_cmpxchg(&sk->sk_cookie, 0, res); -- } --} - - int sock_diag_check_cookie(struct sock *sk, const __u32 *cookie) - { ---- a/net/ipv4/Kconfig -+++ b/net/ipv4/Kconfig -@@ -400,6 +400,7 @@ config INET_TUNNEL - - config INET_DIAG - tristate "INET: socket monitoring interface" -+ select SOCK_DIAG - default y - ---help--- - Support for INET (TCP, DCCP, etc) socket monitoring interface used by ---- a/net/netlink/Kconfig -+++ b/net/netlink/Kconfig -@@ -5,6 +5,7 @@ - - config NETLINK_DIAG - tristate "NETLINK: socket monitoring interface" -+ select SOCK_DIAG - 
default n - ---help--- - Support for NETLINK socket monitoring interface used by the ss tool. ---- a/net/packet/Kconfig -+++ b/net/packet/Kconfig -@@ -19,6 +19,7 @@ config PACKET - config PACKET_DIAG - tristate "Packet: sockets monitoring interface" - depends on PACKET -+ select SOCK_DIAG - default n - ---help--- - Support for PF_PACKET sockets monitoring interface used by the ss tool. ---- a/net/unix/Kconfig -+++ b/net/unix/Kconfig -@@ -28,6 +28,7 @@ config UNIX_SCM - config UNIX_DIAG - tristate "UNIX: socket monitoring interface" - depends on UNIX -+ select SOCK_DIAG - default n - ---help--- - Support for UNIX socket monitoring interface used by the ss tool. diff --git a/root/target/linux/generic/hack-5.4/902-debloat_proc.patch b/root/target/linux/generic/hack-5.4/902-debloat_proc.patch deleted file mode 100755 index 198b0376..00000000 --- a/root/target/linux/generic/hack-5.4/902-debloat_proc.patch +++ /dev/null @@ -1,408 +0,0 @@ -From 9e3f1d0805b2d919904dd9a4ff0d956314cc3cba Mon Sep 17 00:00:00 2001 -From: Felix Fietkau -Date: Sat, 8 Jul 2017 08:20:09 +0200 -Subject: debloat: procfs - -Signed-off-by: Felix Fietkau ---- - fs/locks.c | 2 ++ - fs/proc/Kconfig | 5 +++++ - fs/proc/consoles.c | 3 +++ - fs/proc/proc_tty.c | 11 ++++++++++- - include/net/snmp.h | 18 +++++++++++++++++- - ipc/msg.c | 3 +++ - ipc/sem.c | 2 ++ - ipc/shm.c | 2 ++ - ipc/util.c | 3 +++ - kernel/exec_domain.c | 2 ++ - kernel/irq/proc.c | 9 +++++++++ - kernel/time/timer_list.c | 2 ++ - mm/vmalloc.c | 2 ++ - mm/vmstat.c | 8 +++++--- - net/8021q/vlanproc.c | 6 ++++++ - net/core/net-procfs.c | 18 ++++++++++++------ - net/core/sock.c | 2 ++ - net/ipv4/fib_trie.c | 18 ++++++++++++------ - net/ipv4/proc.c | 3 +++ - net/ipv4/route.c | 3 +++ - 20 files changed, 105 insertions(+), 17 deletions(-) - ---- a/fs/locks.c -+++ b/fs/locks.c -@@ -2989,6 +2989,8 @@ static const struct seq_operations locks - - static int __init proc_locks_init(void) - { -+ if (IS_ENABLED(CONFIG_PROC_STRIPPED)) -+ return 0; - proc_create_seq_private("locks", 0, NULL, &locks_seq_operations, - sizeof(struct locks_iterator), NULL); - return 0; ---- a/fs/proc/Kconfig -+++ b/fs/proc/Kconfig -@@ -100,6 +100,11 @@ config PROC_CHILDREN - Say Y if you are running any user-space software which takes benefit from - this interface. For example, rkt is such a piece of software. 
- -+config PROC_STRIPPED -+ default n -+ depends on EXPERT -+ bool "Strip non-essential /proc functionality to reduce code size" -+ - config PROC_PID_ARCH_STATUS - def_bool n - depends on PROC_FS ---- a/fs/proc/consoles.c -+++ b/fs/proc/consoles.c -@@ -92,6 +92,9 @@ static const struct seq_operations conso - - static int __init proc_consoles_init(void) - { -+ if (IS_ENABLED(CONFIG_PROC_STRIPPED)) -+ return 0; -+ - proc_create_seq("consoles", 0, NULL, &consoles_op); - return 0; - } ---- a/fs/proc/proc_tty.c -+++ b/fs/proc/proc_tty.c -@@ -133,7 +133,10 @@ static const struct seq_operations tty_d - void proc_tty_register_driver(struct tty_driver *driver) - { - struct proc_dir_entry *ent; -- -+ -+ if (IS_ENABLED(CONFIG_PROC_STRIPPED)) -+ return; -+ - if (!driver->driver_name || driver->proc_entry || - !driver->ops->proc_show) - return; -@@ -150,6 +153,9 @@ void proc_tty_unregister_driver(struct t - { - struct proc_dir_entry *ent; - -+ if (IS_ENABLED(CONFIG_PROC_STRIPPED)) -+ return; -+ - ent = driver->proc_entry; - if (!ent) - return; -@@ -164,6 +170,9 @@ void proc_tty_unregister_driver(struct t - */ - void __init proc_tty_init(void) - { -+ if (IS_ENABLED(CONFIG_PROC_STRIPPED)) -+ return; -+ - if (!proc_mkdir("tty", NULL)) - return; - proc_mkdir("tty/ldisc", NULL); /* Preserved: it's userspace visible */ ---- a/include/net/snmp.h -+++ b/include/net/snmp.h -@@ -118,6 +118,21 @@ struct linux_xfrm_mib { - #define DECLARE_SNMP_STAT(type, name) \ - extern __typeof__(type) __percpu *name - -+#ifdef CONFIG_PROC_STRIPPED -+#define __SNMP_STATS_DUMMY(mib) \ -+ do { (void) mib->mibs[0]; } while(0) -+ -+#define __SNMP_INC_STATS(mib, field) __SNMP_STATS_DUMMY(mib) -+#define SNMP_INC_STATS_ATOMIC_LONG(mib, field) __SNMP_STATS_DUMMY(mib) -+#define SNMP_INC_STATS(mib, field) __SNMP_STATS_DUMMY(mib) -+#define SNMP_DEC_STATS(mib, field) __SNMP_STATS_DUMMY(mib) -+#define __SNMP_ADD_STATS(mib, field, addend) __SNMP_STATS_DUMMY(mib) -+#define SNMP_ADD_STATS(mib, field, addend) __SNMP_STATS_DUMMY(mib) -+#define SNMP_UPD_PO_STATS(mib, basefield, addend) __SNMP_STATS_DUMMY(mib) -+#define __SNMP_UPD_PO_STATS(mib, basefield, addend) __SNMP_STATS_DUMMY(mib) -+ -+#else -+ - #define __SNMP_INC_STATS(mib, field) \ - __this_cpu_inc(mib->mibs[field]) - -@@ -148,8 +163,9 @@ struct linux_xfrm_mib { - __this_cpu_add(ptr[basefield##OCTETS], addend); \ - } while (0) - -+#endif - --#if BITS_PER_LONG==32 -+#if (BITS_PER_LONG==32) && !defined(CONFIG_PROC_STRIPPED) - - #define __SNMP_ADD_STATS64(mib, field, addend) \ - do { \ ---- a/ipc/msg.c -+++ b/ipc/msg.c -@@ -1317,6 +1317,9 @@ void __init msg_init(void) - { - msg_init_ns(&init_ipc_ns); - -+ if (IS_ENABLED(CONFIG_PROC_STRIPPED)) -+ return; -+ - ipc_init_proc_interface("sysvipc/msg", - " key msqid perms cbytes qnum lspid lrpid uid gid cuid cgid stime rtime ctime\n", - IPC_MSG_IDS, sysvipc_msg_proc_show); ---- a/ipc/sem.c -+++ b/ipc/sem.c -@@ -243,6 +243,8 @@ void sem_exit_ns(struct ipc_namespace *n - void __init sem_init(void) - { - sem_init_ns(&init_ipc_ns); -+ if (IS_ENABLED(CONFIG_PROC_STRIPPED)) -+ return; - ipc_init_proc_interface("sysvipc/sem", - " key semid perms nsems uid gid cuid cgid otime ctime\n", - IPC_SEM_IDS, sysvipc_sem_proc_show); ---- a/ipc/shm.c -+++ b/ipc/shm.c -@@ -154,6 +154,8 @@ pure_initcall(ipc_ns_init); - - void __init shm_init(void) - { -+ if (IS_ENABLED(CONFIG_PROC_STRIPPED)) -+ return; - ipc_init_proc_interface("sysvipc/shm", - #if BITS_PER_LONG <= 32 - " key shmid perms size cpid lpid nattch uid gid cuid cgid atime dtime ctime rss swap\n", ---- 
a/ipc/util.c -+++ b/ipc/util.c -@@ -140,6 +140,9 @@ void __init ipc_init_proc_interface(cons - struct proc_dir_entry *pde; - struct ipc_proc_iface *iface; - -+ if (IS_ENABLED(CONFIG_PROC_STRIPPED)) -+ return; -+ - iface = kmalloc(sizeof(*iface), GFP_KERNEL); - if (!iface) - return; ---- a/kernel/exec_domain.c -+++ b/kernel/exec_domain.c -@@ -29,6 +29,8 @@ static int execdomains_proc_show(struct - - static int __init proc_execdomains_init(void) - { -+ if (IS_ENABLED(CONFIG_PROC_STRIPPED)) -+ return 0; - proc_create_single("execdomains", 0, NULL, execdomains_proc_show); - return 0; - } ---- a/kernel/irq/proc.c -+++ b/kernel/irq/proc.c -@@ -341,6 +341,9 @@ void register_irq_proc(unsigned int irq, - void __maybe_unused *irqp = (void *)(unsigned long) irq; - char name [MAX_NAMELEN]; - -+ if (IS_ENABLED(CONFIG_PROC_STRIPPED) && !IS_ENABLED(CONFIG_SMP)) -+ return; -+ - if (!root_irq_dir || (desc->irq_data.chip == &no_irq_chip)) - return; - -@@ -394,6 +397,9 @@ void unregister_irq_proc(unsigned int ir - { - char name [MAX_NAMELEN]; - -+ if (IS_ENABLED(CONFIG_PROC_STRIPPED) && !IS_ENABLED(CONFIG_SMP)) -+ return; -+ - if (!root_irq_dir || !desc->dir) - return; - #ifdef CONFIG_SMP -@@ -432,6 +438,9 @@ void init_irq_proc(void) - unsigned int irq; - struct irq_desc *desc; - -+ if (IS_ENABLED(CONFIG_PROC_STRIPPED) && !IS_ENABLED(CONFIG_SMP)) -+ return; -+ - /* create /proc/irq */ - root_irq_dir = proc_mkdir("irq", NULL); - if (!root_irq_dir) ---- a/kernel/time/timer_list.c -+++ b/kernel/time/timer_list.c -@@ -370,6 +370,8 @@ static int __init init_timer_list_procfs - { - struct proc_dir_entry *pe; - -+ if (IS_ENABLED(CONFIG_PROC_STRIPPED)) -+ return 0; - pe = proc_create_seq_private("timer_list", 0400, NULL, &timer_list_sops, - sizeof(struct timer_list_iter), NULL); - if (!pe) ---- a/mm/vmalloc.c -+++ b/mm/vmalloc.c -@@ -3564,6 +3564,8 @@ static const struct seq_operations vmall - - static int __init proc_vmalloc_init(void) - { -+ if (IS_ENABLED(CONFIG_PROC_STRIPPED)) -+ return 0; - if (IS_ENABLED(CONFIG_NUMA)) - proc_create_seq_private("vmallocinfo", 0400, NULL, - &vmalloc_op, ---- a/mm/vmstat.c -+++ b/mm/vmstat.c -@@ -1988,10 +1988,12 @@ void __init init_mm_internals(void) - start_shepherd_timer(); - #endif - #ifdef CONFIG_PROC_FS -- proc_create_seq("buddyinfo", 0444, NULL, &fragmentation_op); -- proc_create_seq("pagetypeinfo", 0400, NULL, &pagetypeinfo_op); -+ if (!IS_ENABLED(CONFIG_PROC_STRIPPED)) { -+ proc_create_seq("buddyinfo", 0444, NULL, &fragmentation_op); -+ proc_create_seq("pagetypeinfo", 0400, NULL, &pagetypeinfo_op); -+ proc_create_seq("zoneinfo", 0444, NULL, &zoneinfo_op); -+ } - proc_create_seq("vmstat", 0444, NULL, &vmstat_op); -- proc_create_seq("zoneinfo", 0444, NULL, &zoneinfo_op); - #endif - } - ---- a/net/8021q/vlanproc.c -+++ b/net/8021q/vlanproc.c -@@ -93,6 +93,9 @@ void vlan_proc_cleanup(struct net *net) - { - struct vlan_net *vn = net_generic(net, vlan_net_id); - -+ if (IS_ENABLED(CONFIG_PROC_STRIPPED)) -+ return; -+ - if (vn->proc_vlan_conf) - remove_proc_entry(name_conf, vn->proc_vlan_dir); - -@@ -112,6 +115,9 @@ int __net_init vlan_proc_init(struct net - { - struct vlan_net *vn = net_generic(net, vlan_net_id); - -+ if (IS_ENABLED(CONFIG_PROC_STRIPPED)) -+ return 0; -+ - vn->proc_vlan_dir = proc_net_mkdir(net, name_root, net->proc_net); - if (!vn->proc_vlan_dir) - goto err; ---- a/net/core/net-procfs.c -+++ b/net/core/net-procfs.c -@@ -279,10 +279,12 @@ static int __net_init dev_proc_net_init( - if (!proc_create_net("dev", 0444, net->proc_net, &dev_seq_ops, - sizeof(struct 
seq_net_private))) - goto out; -- if (!proc_create_seq("softnet_stat", 0444, net->proc_net, -+ if (!IS_ENABLED(CONFIG_PROC_STRIPPED) && -+ !proc_create_seq("softnet_stat", 0444, net->proc_net, - &softnet_seq_ops)) - goto out_dev; -- if (!proc_create_net("ptype", 0444, net->proc_net, &ptype_seq_ops, -+ if (!IS_ENABLED(CONFIG_PROC_STRIPPED) && -+ !proc_create_net("ptype", 0444, net->proc_net, &ptype_seq_ops, - sizeof(struct seq_net_private))) - goto out_softnet; - -@@ -292,9 +294,11 @@ static int __net_init dev_proc_net_init( - out: - return rc; - out_ptype: -- remove_proc_entry("ptype", net->proc_net); -+ if (!IS_ENABLED(CONFIG_PROC_STRIPPED)) -+ remove_proc_entry("ptype", net->proc_net); - out_softnet: -- remove_proc_entry("softnet_stat", net->proc_net); -+ if (!IS_ENABLED(CONFIG_PROC_STRIPPED)) -+ remove_proc_entry("softnet_stat", net->proc_net); - out_dev: - remove_proc_entry("dev", net->proc_net); - goto out; -@@ -304,8 +308,10 @@ static void __net_exit dev_proc_net_exit - { - wext_proc_exit(net); - -- remove_proc_entry("ptype", net->proc_net); -- remove_proc_entry("softnet_stat", net->proc_net); -+ if (!IS_ENABLED(CONFIG_PROC_STRIPPED)) { -+ remove_proc_entry("ptype", net->proc_net); -+ remove_proc_entry("softnet_stat", net->proc_net); -+ } - remove_proc_entry("dev", net->proc_net); - } - ---- a/net/core/sock.c -+++ b/net/core/sock.c -@@ -3643,6 +3643,8 @@ static __net_initdata struct pernet_oper - - static int __init proto_init(void) - { -+ if (IS_ENABLED(CONFIG_PROC_STRIPPED)) -+ return 0; - return register_pernet_subsys(&proto_net_ops); - } - ---- a/net/ipv4/fib_trie.c -+++ b/net/ipv4/fib_trie.c -@@ -2848,11 +2848,13 @@ static const struct seq_operations fib_r - - int __net_init fib_proc_init(struct net *net) - { -- if (!proc_create_net("fib_trie", 0444, net->proc_net, &fib_trie_seq_ops, -+ if (!IS_ENABLED(CONFIG_PROC_STRIPPED) && -+ !proc_create_net("fib_trie", 0444, net->proc_net, &fib_trie_seq_ops, - sizeof(struct fib_trie_iter))) - goto out1; - -- if (!proc_create_net_single("fib_triestat", 0444, net->proc_net, -+ if (!IS_ENABLED(CONFIG_PROC_STRIPPED) && -+ !proc_create_net_single("fib_triestat", 0444, net->proc_net, - fib_triestat_seq_show, NULL)) - goto out2; - -@@ -2863,17 +2865,21 @@ int __net_init fib_proc_init(struct net - return 0; - - out3: -- remove_proc_entry("fib_triestat", net->proc_net); -+ if (!IS_ENABLED(CONFIG_PROC_STRIPPED)) -+ remove_proc_entry("fib_triestat", net->proc_net); - out2: -- remove_proc_entry("fib_trie", net->proc_net); -+ if (!IS_ENABLED(CONFIG_PROC_STRIPPED)) -+ remove_proc_entry("fib_trie", net->proc_net); - out1: - return -ENOMEM; - } - - void __net_exit fib_proc_exit(struct net *net) - { -- remove_proc_entry("fib_trie", net->proc_net); -- remove_proc_entry("fib_triestat", net->proc_net); -+ if (!IS_ENABLED(CONFIG_PROC_STRIPPED)) { -+ remove_proc_entry("fib_trie", net->proc_net); -+ remove_proc_entry("fib_triestat", net->proc_net); -+ } - remove_proc_entry("route", net->proc_net); - } - ---- a/net/ipv4/proc.c -+++ b/net/ipv4/proc.c -@@ -522,5 +522,8 @@ static __net_initdata struct pernet_oper - - int __init ip_misc_proc_init(void) - { -+ if (IS_ENABLED(CONFIG_PROC_STRIPPED)) -+ return 0; -+ - return register_pernet_subsys(&ip_proc_ops); - } ---- a/net/ipv4/route.c -+++ b/net/ipv4/route.c -@@ -410,6 +410,9 @@ static struct pernet_operations ip_rt_pr - - static int __init ip_rt_proc_init(void) - { -+ if (IS_ENABLED(CONFIG_PROC_STRIPPED)) -+ return 0; -+ - return register_pernet_subsys(&ip_rt_proc_ops); - } - diff --git 
a/root/target/linux/generic/hack-5.4/904-debloat_dma_buf.patch b/root/target/linux/generic/hack-5.4/904-debloat_dma_buf.patch deleted file mode 100755 index 76032d9b..00000000 --- a/root/target/linux/generic/hack-5.4/904-debloat_dma_buf.patch +++ /dev/null @@ -1,74 +0,0 @@ -From e3692cb2fcd5ba1244512a0f43b8118f65f1c375 Mon Sep 17 00:00:00 2001 -From: Felix Fietkau -Date: Sat, 8 Jul 2017 08:20:43 +0200 -Subject: debloat: dmabuf - -Signed-off-by: Felix Fietkau ---- - drivers/base/Kconfig | 2 +- - drivers/dma-buf/Makefile | 10 +++++++--- - drivers/dma-buf/dma-buf.c | 4 +++- - kernel/sched/core.c | 1 + - 4 files changed, 12 insertions(+), 5 deletions(-) - ---- a/drivers/base/Kconfig -+++ b/drivers/base/Kconfig -@@ -179,7 +179,7 @@ config SOC_BUS - source "drivers/base/regmap/Kconfig" - - config DMA_SHARED_BUFFER -- bool -+ tristate - default n - select IRQ_WORK - help ---- a/drivers/dma-buf/Makefile -+++ b/drivers/dma-buf/Makefile -@@ -1,9 +1,13 @@ - # SPDX-License-Identifier: GPL-2.0-only --obj-y := dma-buf.o dma-fence.o dma-fence-array.o dma-fence-chain.o \ -- dma-resv.o seqno-fence.o --obj-$(CONFIG_SYNC_FILE) += sync_file.o --obj-$(CONFIG_SW_SYNC) += sw_sync.o sync_debug.o --obj-$(CONFIG_UDMABUF) += udmabuf.o -+obj-$(CONFIG_DMA_SHARED_BUFFER) := dma-shared-buffer.o -+ -+dma-buf-objs-y := dma-buf.o dma-fence.o dma-fence-array.o dma-fence-chain.o \ -+ dma-resv.o seqno-fence.o -+dma-buf-objs-$(CONFIG_SYNC_FILE) += sync_file.o -+dma-buf-objs-$(CONFIG_SW_SYNC) += sw_sync.o sync_debug.o -+dma-buf-objs-$(CONFIG_UDMABUF) += udmabuf.o -+ -+dma-shared-buffer-objs := $(dma-buf-objs-y) - - dmabuf_selftests-y := \ - selftest.o \ ---- a/drivers/dma-buf/dma-buf.c -+++ b/drivers/dma-buf/dma-buf.c -@@ -1314,4 +1314,5 @@ static void __exit dma_buf_deinit(void) - dma_buf_uninit_debugfs(); - kern_unmount(dma_buf_mnt); - } --__exitcall(dma_buf_deinit); -+module_exit(dma_buf_deinit); -+MODULE_LICENSE("GPL"); ---- a/kernel/sched/core.c -+++ b/kernel/sched/core.c -@@ -2770,6 +2770,7 @@ int wake_up_state(struct task_struct *p, - { - return try_to_wake_up(p, state, 0); - } -+EXPORT_SYMBOL_GPL(wake_up_state); - - /* - * Perform scheduler related setup for a newly forked process p. 
---- a/fs/d_path.c -+++ b/fs/d_path.c -@@ -311,6 +311,7 @@ char *dynamic_dname(struct dentry *dentr - buffer += buflen - sz; - return memcpy(buffer, temp, sz); - } -+EXPORT_SYMBOL_GPL(dynamic_dname); - - char *simple_dname(struct dentry *dentry, char *buffer, int buflen) - { diff --git a/root/target/linux/generic/hack-5.4/910-kobject_uevent.patch b/root/target/linux/generic/hack-5.4/910-kobject_uevent.patch deleted file mode 100755 index c4c41ca4..00000000 --- a/root/target/linux/generic/hack-5.4/910-kobject_uevent.patch +++ /dev/null @@ -1,32 +0,0 @@ -From 0d37e6edc09c99e683dd91ca0e83bbc0df8477b3 Mon Sep 17 00:00:00 2001 -From: Felix Fietkau -Date: Sun, 16 Jul 2017 16:56:10 +0200 -Subject: lib: add uevent_next_seqnum() - -Signed-off-by: Felix Fietkau ---- - include/linux/kobject.h | 5 +++++ - lib/kobject_uevent.c | 37 +++++++++++++++++++++++++++++++++++++ - 2 files changed, 42 insertions(+) - ---- a/lib/kobject_uevent.c -+++ b/lib/kobject_uevent.c -@@ -179,6 +179,18 @@ out: - return r; - } - -+u64 uevent_next_seqnum(void) -+{ -+ u64 seq; -+ -+ mutex_lock(&uevent_sock_mutex); -+ seq = ++uevent_seqnum; -+ mutex_unlock(&uevent_sock_mutex); -+ -+ return seq; -+} -+EXPORT_SYMBOL_GPL(uevent_next_seqnum); -+ - /** - * kobject_synth_uevent - send synthetic uevent with arguments - * diff --git a/root/target/linux/generic/hack-5.4/911-kobject_add_broadcast_uevent.patch b/root/target/linux/generic/hack-5.4/911-kobject_add_broadcast_uevent.patch deleted file mode 100755 index 6f5e50d0..00000000 --- a/root/target/linux/generic/hack-5.4/911-kobject_add_broadcast_uevent.patch +++ /dev/null @@ -1,76 +0,0 @@ -From 0d37e6edc09c99e683dd91ca0e83bbc0df8477b3 Mon Sep 17 00:00:00 2001 -From: Felix Fietkau -Date: Sun, 16 Jul 2017 16:56:10 +0200 -Subject: lib: add uevent_next_seqnum() - -Signed-off-by: Felix Fietkau ---- - include/linux/kobject.h | 5 +++++ - lib/kobject_uevent.c | 37 +++++++++++++++++++++++++++++++++++++ - 2 files changed, 42 insertions(+) - ---- a/include/linux/kobject.h -+++ b/include/linux/kobject.h -@@ -32,6 +32,8 @@ - #define UEVENT_NUM_ENVP 32 /* number of env pointers */ - #define UEVENT_BUFFER_SIZE 2048 /* buffer for the variables */ - -+struct sk_buff; -+ - #ifdef CONFIG_UEVENT_HELPER - /* path to the userspace helper executed on an event */ - extern char uevent_helper[]; -@@ -245,4 +247,7 @@ int kobject_synth_uevent(struct kobject - __printf(2, 3) - int add_uevent_var(struct kobj_uevent_env *env, const char *format, ...); - -+int broadcast_uevent(struct sk_buff *skb, __u32 pid, __u32 group, -+ gfp_t allocation); -+ - #endif /* _KOBJECT_H_ */ ---- a/lib/kobject_uevent.c -+++ b/lib/kobject_uevent.c -@@ -691,6 +691,43 @@ int add_uevent_var(struct kobj_uevent_en - EXPORT_SYMBOL_GPL(add_uevent_var); - - #if defined(CONFIG_NET) -+int broadcast_uevent(struct sk_buff *skb, __u32 pid, __u32 group, -+ gfp_t allocation) -+{ -+ struct uevent_sock *ue_sk; -+ int err = 0; -+ -+ /* send netlink message */ -+ mutex_lock(&uevent_sock_mutex); -+ list_for_each_entry(ue_sk, &uevent_sock_list, list) { -+ struct sock *uevent_sock = ue_sk->sk; -+ struct sk_buff *skb2; -+ -+ skb2 = skb_clone(skb, allocation); -+ if (!skb2) -+ break; -+ -+ err = netlink_broadcast(uevent_sock, skb2, pid, group, -+ allocation); -+ if (err) -+ break; -+ } -+ mutex_unlock(&uevent_sock_mutex); -+ -+ kfree_skb(skb); -+ return err; -+} -+#else -+int broadcast_uevent(struct sk_buff *skb, __u32 pid, __u32 group, -+ gfp_t allocation) -+{ -+ kfree_skb(skb); -+ return 0; -+} -+#endif -+EXPORT_SYMBOL_GPL(broadcast_uevent); -+ -+#if 
defined(CONFIG_NET) - static int uevent_net_broadcast(struct sock *usk, struct sk_buff *skb, - struct netlink_ext_ack *extack) - { diff --git a/root/target/linux/generic/hack-5.4/921-always-create-console-node-in-initramfs.patch b/root/target/linux/generic/hack-5.4/921-always-create-console-node-in-initramfs.patch deleted file mode 100755 index e4375790..00000000 --- a/root/target/linux/generic/hack-5.4/921-always-create-console-node-in-initramfs.patch +++ /dev/null @@ -1,40 +0,0 @@ -From 5d301596fdc72f6cb672f72eb3c66e7cddefb103 Mon Sep 17 00:00:00 2001 -From: Felix Fietkau -Date: Sat, 8 Jul 2017 08:26:02 +0200 -Subject: initramfs: always create console node - -Signed-off-by: Felix Fietkau ---- - usr/gen_initramfs_list.sh | 14 ++++++++++++++ - 1 file changed, 14 insertions(+) - ---- a/usr/gen_initramfs_list.sh -+++ b/usr/gen_initramfs_list.sh -@@ -59,6 +59,18 @@ default_initramfs() { - EOF - } - -+list_openwrt_initramfs() { -+ : -+} -+ -+openwrt_initramfs() { -+ # make sure that /dev/console exists -+ cat <<-EOF >> ${output} -+ dir /dev 0755 0 0 -+ nod /dev/console 0600 0 0 c 5 1 -+ EOF -+} -+ - filetype() { - local argv1="$1" - -@@ -180,6 +192,8 @@ dir_filelist() { - if [ "$(echo "${dirlist}" | wc -l)" -gt 1 ]; then - ${dep_list}print_mtime "$1" - -+ ${dep_list}openwrt_initramfs -+ - echo "${dirlist}" | \ - while read x; do - ${dep_list}parse ${x} diff --git a/root/target/linux/generic/pending-5.15/681-NET-add-mtd-mac-address-support-to-of_get_mac_addres.patch b/root/target/linux/generic/pending-5.15/681-NET-add-mtd-mac-address-support-to-of_get_mac_addres.patch deleted file mode 100755 index 1fe0d915..00000000 --- a/root/target/linux/generic/pending-5.15/681-NET-add-mtd-mac-address-support-to-of_get_mac_addres.patch +++ /dev/null @@ -1,102 +0,0 @@ -From 6f8e5369ae054ec6c9265581d5a7e39738a5cd84 Mon Sep 17 00:00:00 2001 -From: Ansuel Smith -Date: Tue, 30 Mar 2021 13:16:38 +0200 -Subject: [PATCH 1/2] NET: add mtd-mac-address support to of_get_mac_address() - -Many embedded devices have information such as mac addresses stored inside mtd -devices. This patch allows us to add a property inside a node describing a -network interface. The new property points at a mtd partition with an offset -where the mac address can be found. 
- -Signed-off-by: John Crispin -Signed-off-by: Felix Fietkau -Signed-off-by: Ansuel Smith ---- - drivers/of/of_net.c | 75 ++++++++++++++++++++++++++++++++++++++++++++- - 1 file changed, 74 insertions(+), 1 deletion(-) - ---- a/drivers/of/of_net.c -+++ b/drivers/of/of_net.c -@@ -12,6 +12,7 @@ - #include - #include - #include -+#include - - /** - * of_get_phy_mode - Get phy mode for given device_node -@@ -95,6 +96,52 @@ static int of_get_mac_addr_nvmem(struct - return 0; - } - -+static int of_get_mac_address_mtd(struct device_node *np, u8 *addr) -+{ -+#ifdef CONFIG_MTD -+ struct platform_device *pdev = of_find_device_by_node(np); -+ struct device_node *mtd_np = NULL; -+ size_t retlen; -+ int size, ret; -+ struct mtd_info *mtd; -+ const char *part; -+ const __be32 *list; -+ phandle phandle; -+ u8 mac[ETH_ALEN]; -+ -+ list = of_get_property(np, "mtd-mac-address", &size); -+ if (!list || (size != (2 * sizeof(*list)))) -+ return -ENODEV; -+ -+ phandle = be32_to_cpup(list++); -+ if (phandle) -+ mtd_np = of_find_node_by_phandle(phandle); -+ -+ if (!mtd_np) -+ return -ENODEV; -+ -+ part = of_get_property(mtd_np, "label", NULL); -+ if (!part) -+ part = mtd_np->name; -+ -+ mtd = get_mtd_device_nm(part); -+ if (IS_ERR(mtd)) -+ return -ENODEV; -+ -+ ret = mtd_read(mtd, be32_to_cpup(list), 6, &retlen, mac); -+ put_mtd_device(mtd); -+ -+ if (!is_valid_ether_addr(mac)) -+ return -EINVAL; -+ -+ memcpy(addr, mac, ETH_ALEN); -+ -+ return 0; -+#endif -+ return -EINVAL; -+} -+ -+ - /** - * of_get_mac_address() - * @np: Caller's Device Node -@@ -119,6 +166,10 @@ static int of_get_mac_addr_nvmem(struct - * this case, the real MAC is in 'local-mac-address', and 'mac-address' exists - * but is all zeros. - * -+ * -+ * If a mtd-mac-address property exists, try to fetch the MAC address from the -+ * specified mtd device. -+ * - * Return: 0 on success and errno in case of error. - */ - int of_get_mac_address(struct device_node *np, u8 *addr) -@@ -140,6 +191,10 @@ int of_get_mac_address(struct device_nod - if (!ret) - return 0; - -+ ret = of_get_mac_address_mtd(np, addr); -+ if (!ret) -+ return 0; -+ - return of_get_mac_addr_nvmem(np, addr); - } - EXPORT_SYMBOL(of_get_mac_address); diff --git a/root/target/linux/generic/pending-5.15/735-net-phy-at803x-fix-at8033-sgmii-mode.patch b/root/target/linux/generic/pending-5.15/735-net-phy-at803x-fix-at8033-sgmii-mode.patch deleted file mode 100755 index 33a994a9..00000000 --- a/root/target/linux/generic/pending-5.15/735-net-phy-at803x-fix-at8033-sgmii-mode.patch +++ /dev/null @@ -1,51 +0,0 @@ -From: Roman Yeryomin -Subject: kernel: add at803x fix for sgmii mode - -Some (possibly broken) bootloaders incorreclty initialize at8033 -phy. This patch enables sgmii autonegotiation mode. 
- -[john@phrozen.org: felix added this to his upstream queue] - -Signed-off-by: Roman Yeryomin ---- - drivers/net/phy/at803x.c | 25 +++++++++++++++++++++++++ - 1 file changed, 25 insertions(+) - ---- a/drivers/net/phy/at803x.c -+++ b/drivers/net/phy/at803x.c -@@ -76,6 +76,7 @@ - #define AT803X_LOC_MAC_ADDR_32_47_OFFSET 0x804A - #define AT803X_REG_CHIP_CONFIG 0x1f - #define AT803X_BT_BX_REG_SEL 0x8000 -+#define AT803X_SGMII_ANEG_EN 0x1000 - - #define AT803X_DEBUG_ADDR 0x1D - #define AT803X_DEBUG_DATA 0x1E -@@ -790,6 +791,27 @@ static int at8031_pll_config(struct phy_ - static int at803x_config_init(struct phy_device *phydev) - { - int ret; -+ u32 v; -+ -+ if (phydev->drv->phy_id == ATH8031_PHY_ID && -+ phydev->interface == PHY_INTERFACE_MODE_SGMII) -+ { -+ v = phy_read(phydev, AT803X_REG_CHIP_CONFIG); -+ /* select SGMII/fiber page */ -+ ret = phy_write(phydev, AT803X_REG_CHIP_CONFIG, -+ v & ~AT803X_BT_BX_REG_SEL); -+ if (ret) -+ return ret; -+ /* enable SGMII autonegotiation */ -+ ret = phy_write(phydev, MII_BMCR, AT803X_SGMII_ANEG_EN); -+ if (ret) -+ return ret; -+ /* select copper page */ -+ ret = phy_write(phydev, AT803X_REG_CHIP_CONFIG, -+ v | AT803X_BT_BX_REG_SEL); -+ if (ret) -+ return ret; -+ } - - /* The RX and TX delay default is: - * after HW reset: RX delay enabled and TX delay disabled diff --git a/root/target/linux/generic/pending-5.15/761-net-dsa-mt7530-Support-EEE-features.patch b/root/target/linux/generic/pending-5.15/761-net-dsa-mt7530-Support-EEE-features.patch deleted file mode 100755 index 405f87ff..00000000 --- a/root/target/linux/generic/pending-5.15/761-net-dsa-mt7530-Support-EEE-features.patch +++ /dev/null @@ -1,103 +0,0 @@ -From 9cfb2d426c38272f245e9e6f62b3552d1ed5852b Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Ren=C3=A9=20van=20Dorst?= -Date: Tue, 21 Apr 2020 00:18:08 +0200 -Subject: [PATCH] net: dsa: mt7530: Support EEE features -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Signed-off-by: RenĂ© van Dorst ---- a/drivers/net/dsa/mt7530.c -+++ b/drivers/net/dsa/mt7530.c -@@ -2752,9 +2752,13 @@ static void mt753x_phylink_mac_link_up(s - switch (speed) { - case SPEED_1000: - mcr |= PMCR_FORCE_SPEED_1000; -+ if (priv->eee_enable & BIT(port)) -+ mcr |= PMCR_FORCE_EEE1G; - break; - case SPEED_100: - mcr |= PMCR_FORCE_SPEED_100; -+ if (priv->eee_enable & BIT(port)) -+ mcr |= PMCR_FORCE_EEE100; - break; - } - if (duplex == DUPLEX_FULL) { -@@ -3031,6 +3035,54 @@ static int mt753x_set_mac_eee(struct dsa - - return 0; - } -+ -+static int mt7530_get_mac_eee(struct dsa_switch *ds, int port, -+ struct ethtool_eee *e) -+{ -+ struct mt7530_priv *priv = ds->priv; -+ u32 eeecr, pmsr; -+ -+ e->eee_enabled = !!(priv->eee_enable & BIT(port)); -+ -+ if (e->eee_enabled) { -+ eeecr = mt7530_read(priv, MT7530_PMEEECR_P(port)); -+ e->tx_lpi_enabled = !(eeecr & LPI_MODE_EN); -+ e->tx_lpi_timer = (eeecr >> 4) & 0xFFF; -+ pmsr = mt7530_read(priv, MT7530_PMSR_P(port)); -+ e->eee_active = e->eee_enabled && !!(pmsr & PMSR_EEE1G); -+ } else { -+ e->tx_lpi_enabled = 0; -+ e->tx_lpi_timer = 0; -+ e->eee_active = 0; -+ } -+ -+ return 0; -+} -+ -+static int mt7530_set_mac_eee(struct dsa_switch *ds, int port, -+ struct ethtool_eee *e) -+{ -+ struct mt7530_priv *priv = ds->priv; -+ u32 eeecr; -+ -+ if (e->tx_lpi_enabled && e->tx_lpi_timer > 0xFFF) -+ return -EINVAL; -+ -+ if (e->eee_enabled) { -+ priv->eee_enable |= BIT(port); -+ //MT7530_PMEEECR_P -+ eeecr = mt7530_read(priv, MT7530_PMEEECR_P(port)); -+ eeecr &= 0xFFFF0000; -+ if (!e->tx_lpi_enabled) -+ 
eeecr |= LPI_MODE_EN; -+ eeecr = LPI_THRESH(e->tx_lpi_timer); -+ mt7530_write(priv, MT7530_PMEEECR_P(port), eeecr); -+ } else { -+ priv->eee_enable &= ~(BIT(port)); -+ } -+ -+ return 0; -+} - - static const struct dsa_switch_ops mt7530_switch_ops = { - .get_tag_protocol = mtk_get_tag_protocol, ---- a/drivers/net/dsa/mt7530.h -+++ b/drivers/net/dsa/mt7530.h -@@ -329,6 +329,12 @@ enum mt7530_vlan_port_attr { - #define MAX_RX_PKT_LEN_1552 0x2 - #define MAX_RX_PKT_LEN_JUMBO 0x3 - -+#define MT7530_PMEEECR_P(x) (0x3004 + (x) * 0x100) -+#define WAKEUP_TIME_1000(x) ((x & 0xFF) << 24) -+#define WAKEUP_TIME_100(x) ((x & 0xFF) << 16) -+#define LPI_THRESH(x) ((x & 0xFFF) << 4) -+#define LPI_MODE_EN BIT(0) -+ - /* Register for MIB */ - #define MT7530_PORT_MIB_COUNTER(x) (0x4000 + (x) * 0x100) - #define MT7530_MIB_CCR 0x4fe0 -@@ -804,6 +810,7 @@ struct mt7530_priv { - unsigned int p5_intf_sel; - u8 mirror_rx; - u8 mirror_tx; -+ u8 eee_enable; - - struct mt7530_port ports[MT7530_NUM_PORTS]; - /* protect among processes for registers access*/