diff --git a/shortcut-fe/.gitignore b/shortcut-fe/.gitignore deleted file mode 100644 index 958088547..000000000 --- a/shortcut-fe/.gitignore +++ /dev/null @@ -1,5 +0,0 @@ -# Ouptut files - -*.o -*.s - diff --git a/shortcut-fe/Makefile b/shortcut-fe/Makefile deleted file mode 100644 index 27e41299b..000000000 --- a/shortcut-fe/Makefile +++ /dev/null @@ -1,42 +0,0 @@ -# -# Makefile for Shortcut FE. -# - -KERNELVERSION := $(word 1, $(subst ., ,$(KERNELVERSION))).$(word 2, $(subst ., ,$(KERNELVERSION))) - -SFE_BASE_OBJS := sfe.o sfe_init.o -SFE_IPV4_OBJS := sfe_ipv4.o sfe_ipv4_udp.o sfe_ipv4_tcp.o sfe_ipv4_icmp.o sfe_ipv4_tun6rd.o sfe_ipv4_pppoe_br.o sfe_ipv4_esp.o -SFE_IPV6_OBJS := sfe_ipv6.o sfe_ipv6_udp.o sfe_ipv6_tcp.o sfe_ipv6_icmp.o sfe_ipv6_tunipip6.o sfe_ipv6_pppoe_br.o sfe_ipv6_esp.o -SFE_PPPOE_OBJS := sfe_pppoe.o sfe_pppoe_mgr.o - - -ifeq ($(findstring 4.4, $(KERNELVERSION)),) - SFE_IPV4_OBJS += sfe_ipv4_gre.o - SFE_IPV6_OBJS += sfe_ipv6_gre.o - ccflags-y += -DSFE_GRE_TUN_ENABLE -endif - - -obj-m += qca-nss-sfe.o - -# -# Base files -# -qca-nss-sfe-objs := $(SFE_BASE_OBJS) - -# -# IPv4 files -# -qca-nss-sfe-objs += $(SFE_IPV4_OBJS) -qca-nss-sfe-objs += $(SFE_PPPOE_OBJS) - -ifdef SFE_SUPPORT_IPV6 -qca-nss-sfe-objs += $(SFE_IPV6_OBJS) -ccflags-y += -DSFE_SUPPORT_IPV6 -endif - -ifdef SFE_PROCESS_LOCAL_OUT -ccflags-y += -DSFE_PROCESS_LOCAL_OUT -endif - -ccflags-y += -Werror -Wall -Iexports/ diff --git a/shortcut-fe/README b/shortcut-fe/README deleted file mode 100644 index 1bf1cc255..000000000 --- a/shortcut-fe/README +++ /dev/null @@ -1,122 +0,0 @@ -Shortcut Forwarding Engine --------------------------- - -Welcome to "Shortcut" :-) - -Here's a quick FAQ: - - -Q) What is Shortcut? - -A) Shortcut is an in-Linux-kernel IP packet forwarding engine. It's designed -to offer very high speed IP packet forwarding based on IP connection tracking. -It's dramatically faster than the standard netfilter-based NAT forwarding path -but is designed to synchronise state back to netfilter/conntrack so that it -doesn't need to deal with all of the complexities of special cases. - - -Q) What versions of IP does it support? - -A) The current version only supports IPv4 but will be extended to support IPv6 in -the future. - - -Q) What transport protocols does it support? - -A) TCP and UDP. It also knows enough about ICMP to spot ICMP error messages -related to TCP and UDP and handle things accordingly. - - -Q) Is there a design spec for this software? - -A) Not at the moment. I'll write one when I get more time. The code is -intended to be a good tutorial though - it's very heavily commented. If you -find yourself reading something and not understanding it then I take that to -mean I've probably not done a sufficently good job of explaining what it's -doing in the comments. Let me know - I will try to fix it :-) - - -Q) Why was it written? - -A) It was written as a demonstration of what can be done to provide high -performance forwarding inside the kernel. There were two initial motivations: - -1) To provide a platform to enable research into how QoS analysis systems can -offload work and avoid huge Linux overheads. - -2) To provide a tool to investigate the behaviour of various processors, SoCs -and software sets so that we can characterize and design new network processor -SoCs. - - -Q) How much faster is it than the Linux kernel forwarding path? - -A) At the time of pushing this to github it's been tested on a QCA AP135. -This has a Scorpion (QCA Scopion, not the QMC one :-)) SoC, QCA9550. 
The -SoC's processor is a MIPS74K running at 720 MHz and with a DDR2 memory -subsystem that offers a peak of 600 MT/s (16-bit transfers). - -Running IPv4 NAT forwarding of UDP between the board's 2 GMAC ports and -using a SmartBits 200 as a traffic generator Linux is able to forward 70k PPS. -Once the SFE code is invoked this will increase to 350k PPS! - -There's also a slightly hacky mode which causes SFE to bypass the Linux -bridge layer, but this isn't really ready for use because it doesn't have -sufficient MAC address checks or integration of statistics back to the -Ethernet bridge, but that runs at 436k PPS. - - -Q) Are there any diagnostics? - -A) Yes, this is a research tool after all! There's a complex way to do this -that's more general purpose and a simple one - here's the simple one: - - mknod /dev/sfe c 253 0 - -The file /dev/sfe is an XML-ish output and provides details of all the -network connections currently being offloaded. It also reports the numbers -of packets that took various "exception" paths within the code. In addition -it provides a summary of the number of connections, attempts to accelerate -connections, cancel accelerations, etc. It also reports the numbers of -packets that were forwarded and not forwarded by the engine and has some -stats on the effectiveness of the hashing algorithm it uses. - - -Q) How does the code interact with Linux? - -A) There are four minor patches required to make this software run with -Linux. These are currently against a 3.3.8 or 3.4.0 kernel: - -* (net/core/dev.c) adds a hook to allow packets to be extracted out. - -* (net/netfilter/nf_conntrack_proto_tcp.c) exposes a state variable inside - netfilter that's necessary to enable TCP sequence and ACK checking within - the offload path. Note that this specific patch is against the QCA QSDK - patched version of 3.3.8 - there's a slightly braindead "performance" - patch in that kernel, courtesy of the OpenWrt community that makes the - Linux forwarding path slightly faster at the expense of losing - functionality :-( - -* (net/Kconfig) adds the shortcut-fe option. - -* (net/Makefile) adds the shortcut-fe build support. - -Once these are applied and the module is loaded then everything else -is automatic :-) The patches are in this git repo. - - -Q) Are any of the pieces reused from other projects? - -A) Yes! Some of the forwarding concepts are reused from the Ubicom Network -Accelerator that morphed into part of the Akronite NSS. This code has all -been substantially changed though to accomodate Linux's needs. - -There are also some pieces that I borrowed from the QCA "FastNAT" software -written by Xiaoping Fan . Xiaoping's code was the -first actual demonstration within QCA that this in-kernel concept could yield -signficant performance gains. - - -Enjoy! -Dave Hudson - diff --git a/shortcut-fe/exports/sfe_api.h b/shortcut-fe/exports/sfe_api.h deleted file mode 100644 index 50a77330c..000000000 --- a/shortcut-fe/exports/sfe_api.h +++ /dev/null @@ -1,832 +0,0 @@ -/* - * sfe_api.h - * SFE exported function headers for SFE engine. - * - * Copyright (c) 2015,2016, The Linux Foundation. All rights reserved. - * Copyright (c) 2021-2022 Qualcomm Innovation Center, Inc. All rights reserved. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. 
- * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ - - -/** - * @file sfe_api.h - * SFE exported function headers for the SFE engine. - */ - -#ifndef __SFE_API_H -#define __SFE_API_H - -/** - * @addtogroup nss_sfe_definitions - * @{ - */ - -#define SFE_MAX_VLAN_DEPTH 2 /**< Maximum VLAN depth. */ -#define SFE_VLAN_ID_NOT_CONFIGURED 0xfff /**< VLAN ID not configured. */ -#define SFE_INVALID_VLAN_PCP 0xff /**< VLAN PCP remark is invalid for SAWF (Service Aware Wi-Fi). */ -#define SFE_MAX_SERVICE_CLASS_ID 0x80 /**< Maximum service class ID. */ -#define SFE_INVALID_SERVICE_CLASS_ID 0xff /**< Service class ID not valid. */ -#define SFE_SERVICE_CLASS_STATS_MAX_RETRY 100 /**< Maximum retries for fetching service class statistics. */ -#define SFE_INVALID_MSDUQ 0xff /**< Invalid MAC Service Data Unit Queue. */ - -#define SFE_SPECIAL_INTERFACE_BASE 0x7f00 /**< Special interface base number. */ -#define SFE_SPECIAL_INTERFACE_IPV4 (SFE_SPECIAL_INTERFACE_BASE + 1) /**< Interface number for IPv4. */ -#define SFE_SPECIAL_INTERFACE_IPV6 (SFE_SPECIAL_INTERFACE_BASE + 2) /**< Interface enumber fo IPv6. */ -#define SFE_SPECIAL_INTERFACE_IPSEC (SFE_SPECIAL_INTERFACE_BASE + 3) /**< Interface number for IPSec. */ -#define SFE_SPECIAL_INTERFACE_L2TP (SFE_SPECIAL_INTERFACE_BASE + 4) /**< Interface number for L2TP. */ -#define SFE_SPECIAL_INTERFACE_PPTP (SFE_SPECIAL_INTERFACE_BASE + 5) /**< Interface number for PPTP. */ - -/** -* @} -*/ - -/** - * @addtogroup nss_sfe_flags - * @{ - */ - -/* - * Rule creation and rule update flags. - */ -#define SFE_RULE_CREATE_FLAG_NO_SEQ_CHECK (1<<0) /**< Do not perform TCP sequence number checks. */ -#define SFE_RULE_CREATE_FLAG_BRIDGE_FLOW (1<<1) /**< Rule is for a pure bridge forwarding flow. */ -#define SFE_RULE_CREATE_FLAG_ROUTED (1<<2) /**< Rule is for a routed connection. */ -#define SFE_RULE_CREATE_FLAG_DSCP_MARKING (1<<3) /**< Rule has DSCP marking configured. */ -#define SFE_RULE_CREATE_FLAG_VLAN_MARKING (1<<4) /**< Rule has VLAN marking configured. */ -#define SFE_RULE_UPDATE_FLAG_CHANGE_MTU (1<<5) /**< Update MTU of connection interfaces. */ -#define SFE_RULE_CREATE_FLAG_ICMP_NO_CME_FLUSH (1<<6) /**< Rule to not flush CME on ICMP packets. */ -#define SFE_RULE_CREATE_FLAG_L2_ENCAP (1<<7) /**< Consists of an encapsulating protocol that carries an IPv4 payload within it. */ -#define SFE_RULE_CREATE_FLAG_USE_FLOW_BOTTOM_INTERFACE (1<<8) /**< Use flow interface number instead of top interface. */ -#define SFE_RULE_CREATE_FLAG_USE_RETURN_BOTTOM_INTERFACE (1<<9) /**< Use return interface number instead of top interface. */ -#define SFE_RULE_CREATE_FLAG_FLOW_SRC_INTERFACE_CHECK (1<<10) /**< Check source interface on the flow direction. */ -#define SFE_RULE_CREATE_FLAG_RETURN_SRC_INTERFACE_CHECK (1<<11) - /**< Check source interface on the return direction. */ -#define SFE_RULE_CREATE_FLAG_FLOW_TRANSMIT_FAST (1<<12) /**< Original flow can be transmitted fast. */ -#define SFE_RULE_CREATE_FLAG_RETURN_TRANSMIT_FAST (1<<13) /**< Return flow can be transmitted fast. 
*/ -#define SFE_RULE_CREATE_FLAG_FLOW_SRC_INTERFACE_CHECK_NO_FLUSH (1<<14) - /**< Check source interface on the flow direction but do not flush the connection. */ -#define SFE_RULE_CREATE_FLAG_RETURN_SRC_INTERFACE_CHECK_NO_FLUSH (1<<15) - /**< Check source interface on the return direction but do not flush the connection. */ - -/* - * Rule creation validity flags. - */ -#define SFE_RULE_CREATE_CONN_VALID (1<<0) /**< IPv4 connection is valid. */ -#define SFE_RULE_CREATE_TCP_VALID (1<<1) /**< TCP protocol fields are valid. */ -#define SFE_RULE_CREATE_PPPOE_DECAP_VALID (1<<2) /**< PPPoE decapsulation fields are valid. */ -#define SFE_RULE_CREATE_PPPOE_ENCAP_VALID (1<<3) /**< PPPoE encapsulation fields are valid. */ -#define SFE_RULE_CREATE_QOS_VALID (1<<4) /**< QoS fields are valid. */ -#define SFE_RULE_CREATE_VLAN_VALID (1<<5) /**< VLAN fields are valid. */ -#define SFE_RULE_CREATE_DSCP_MARKING_VALID (1<<6) /**< DSCP marking fields are valid. */ -#define SFE_RULE_CREATE_VLAN_MARKING_VALID (1<<7) /**< VLAN marking fields are valid. */ -#define SFE_RULE_CREATE_DIRECTION_VALID (1<<8) /**< Acceleration direction is valid. */ -#define SFE_RULE_CREATE_SRC_MAC_VALID (1<<9) /**< Source MAC address is valid. */ -#define SFE_RULE_CREATE_MARK_VALID (1<<10) /**< SKB marking fields are valid. */ - -/* - * Source MAC address validity flags; used with the mac_valid_flags field in the sfe_ipv4_src_mac_rule structure. - */ -#define SFE_SRC_MAC_FLOW_VALID 0x01 - /**< MAC address for the flow interface is valid. */ -#define SFE_SRC_MAC_RETURN_VALID 0x02 - /**< MAC address for the return interface is valid. */ - -/** - * @} - */ - -/** - * @addtogroup nss_sfe_datatypes - * @{ - */ - -/* - * 32/64-bit pointer types. - */ -#ifdef __LP64__ -typedef uint64_t sfe_ptr_t; /**< 64-bit pointer. */ -#else -typedef uint32_t sfe_ptr_t; /**< 32-bit pointer. */ -#endif - -/** -* Synchronize reason enum. -*/ -typedef enum /** @cond */ sfe_rule_sync_reason /** @endcond */ { - SFE_RULE_SYNC_REASON_STATS, /**< Synchronize statistics. */ - SFE_RULE_SYNC_REASON_FLUSH, /**< Synchronize to flush an entry. */ - SFE_RULE_SYNC_REASON_EVICT, /**< Synchronize to evict an entry. */ - SFE_RULE_SYNC_REASON_DESTROY /**< Synchronize to destroy an entry (requested by the connection manager). */ - -} sfe_rule_sync_reason_t; - -/** - * Tx command status. - */ -typedef enum { - SFE_TX_SUCCESS = 0, /**< Success. */ - SFE_TX_FAILURE, /**< Failure other than descriptor not available. */ - SFE_TX_FAILURE_QUEUE, /**< failure due to descriptor not available. */ - SFE_TX_FAILURE_NOT_READY, /**< Failure due to SFE state uninitialized. */ - SFE_TX_FAILURE_TOO_LARGE, /**< Command is too large to fit in one message. */ - SFE_TX_FAILURE_TOO_SHORT, /**< Command or packet is shorter than expected. */ - SFE_TX_FAILURE_NOT_SUPPORTED, /**< Command or packet not accepted for forwarding. */ - SFE_TX_FAILURE_BAD_PARAM, /**< Failure due to bad parameters. */ - SFE_TX_FAILURE_NOT_ENABLED, /**< Failure due to SFE not enabled. */ -} sfe_tx_status_t; - -/** - * Common response types. - */ -enum sfe_cmn_response { - SFE_CMN_RESPONSE_ACK, /**< Message acknowledged. */ - SFE_CMN_RESPONSE_EVERSION, /**< Version error. */ - SFE_CMN_RESPONSE_EINTERFACE, /**< Interface error. */ - SFE_CMN_RESPONSE_ELENGTH, /**< Length error. */ - SFE_CMN_RESPONSE_EMSG, /**< Message error. */ - SFE_CMM_RESPONSE_NOTIFY, /**< Message independant of request. */ - SFE_CMN_RESPONSE_LAST /**< Indicates the last item. */ -}; - -/** - * IPv4 bridge/route rule messages. 
- */ -enum sfe_message_types { - SFE_TX_CREATE_RULE_MSG, /**< IPv4/IPv6 create rule message. */ - SFE_TX_DESTROY_RULE_MSG, /**< IPv4/IPv6 destroy rule message. */ - SFE_RX_CONN_STATS_SYNC_MSG, /**< IPv4/IPv6 connection statistics synchronize message. */ - SFE_TX_CONN_STATS_SYNC_MANY_MSG,/**< IPv4/IPv6 connection statistics synchronize many message. */ - SFE_TUN6RD_ADD_UPDATE_PEER, /**< Add/update peer for 6RD tunnel. */ - SFE_MAX_MSG_TYPES, /**< IPv4/IPv6 message max type number. */ -}; - -/** - * Connection mark structure. - */ -struct sfe_connection_mark { - int protocol; /**< Protocol number. */ - __be32 src_ip[4]; /**< Source IP address. */ - __be32 dest_ip[4]; /**< Destination IP address. */ - __be16 src_port; /**< Source port number. */ - __be16 dest_port; /**< Destination port number. */ - u32 mark; /**< Mark value to be updated. */ -}; - -/** - * Common message structure. - */ -struct sfe_cmn_msg { - u16 version; /**< Version ID for the main message format. */ - u16 interface; /**< Primary key for all messages. */ - enum sfe_cmn_response response; /**< Primary response. */ - u32 type; /**< Decentralized request ID used to match response ID. */ - u32 error; /**< Decentralized specific error message; response == EMSG. */ - sfe_ptr_t cb; /**< Callback pointer. */ - sfe_ptr_t app_data; /**< Application data. */ - u32 len; /**< Length of the message excluding this header. */ -}; - -/** - * Common 5-tuple structure. - */ -struct sfe_ipv4_5tuple { - __be32 flow_ip; /**< Flow IP address. */ - __be32 return_ip; /**< Return IP address. */ - __be16 flow_ident; /**< Flow identifier, e.g., TCP/UDP port. */ - __be16 return_ident; /**< Return identifier, e.g., TCP/UDP port. */ - u8 protocol; /**< Protocol number. */ - u8 reserved[3]; /**< Reserved; padding for alignment. */ -}; - -/** - * IPv4 connection rule structure. - */ -struct sfe_ipv4_connection_rule { - u8 flow_mac[6]; /**< Flow MAC address. */ - u8 return_mac[6]; /**< Return MAC address. */ - s32 flow_interface_num; /**< Flow interface number. */ - s32 return_interface_num; /**< Return interface number. */ - s32 flow_top_interface_num; /**< Top flow interface number. */ - s32 return_top_interface_num; /**< Top return interface number. */ - u32 flow_mtu; /**< Flow interface`s MTU. */ - u32 return_mtu; /**< Return interface`s MTU. */ - __be32 flow_ip_xlate; /**< Translated flow IP address. */ - __be32 return_ip_xlate; /**< Translated return IP address. */ - __be16 flow_ident_xlate; /**< Translated flow identifier, e.g., port. */ - __be16 return_ident_xlate; /**< Translated return identifier, e.g., port. */ -}; - -/** - * TCP connection rule structure. - */ -struct sfe_protocol_tcp_rule { - u32 flow_max_window; /**< Flow direction's largest seen window. */ - u32 return_max_window; /**< Return direction's largest seen window. */ - u32 flow_end; /**< Flow direction's largest seen sequence + segment length. */ - u32 return_end; /**< Return direction's largest seen sequence + segment length. */ - u32 flow_max_end; /**< Flow direction's largest seen ack + max(1, win). */ - u32 return_max_end; /**< Return direction's largest seen ack + max(1, win). */ - u8 flow_window_scale; /**< Flow direction's window scaling factor. */ - u8 return_window_scale; /**< Return direction's window scaling factor. */ - u16 reserved; /**< Reserved; padding for alignment. */ -}; - -/** - * sfe_pppoe_br_accel_mode_t - * PPPoE bridge acceleration modes. 
- */ -typedef enum { - SFE_PPPOE_BR_ACCEL_MODE_DISABLED, /**< No acceleration */ - SFE_PPPOE_BR_ACCEL_MODE_EN_5T, /**< 5-tuple (src_ip, dest_ip, src_port, dest_port, protocol) acceleration */ - SFE_PPPOE_BR_ACCEL_MODE_EN_3T, /**< 3-tuple (src_ip, dest_ip, pppoe session id) acceleration */ - SFE_PPPOE_BR_ACCEL_MODE_MAX /**< Indicates the last item */ -} __attribute__ ((__packed__)) sfe_pppoe_br_accel_mode_t; - -/** - * PPPoE connection rules structure. - */ -struct sfe_pppoe_rule { - u16 flow_pppoe_session_id; /**< Flow direction`s PPPoE session ID. */ - u8 flow_pppoe_remote_mac[ETH_ALEN]; /**< Flow direction`s PPPoE server MAC address. */ - u16 return_pppoe_session_id; /**< Return direction's PPPoE session ID. */ - u8 return_pppoe_remote_mac[ETH_ALEN]; /**< Return direction's PPPoE server MAC address. */ -}; - -/** - * Information for source MAC address rules. - */ -struct sfe_src_mac_rule { - uint32_t mac_valid_flags; /**< MAC address validity flags. */ - uint16_t flow_src_mac[3]; /**< Source MAC address for the flow direction. */ - uint16_t return_src_mac[3]; /**< Source MAC address for the return direction. */ -}; - -/** - * QoS connection rule structure. - */ -struct sfe_qos_rule { - u32 flow_qos_tag; /**< QoS tag associated with this rule for flow direction. */ - u32 return_qos_tag; /**< QoS tag associated with this rule for return direction. */ -}; - -/** -* Mark rule structure. -*/ -struct sfe_mark_rule { - u32 flow_mark; /**< SKB mark associated with this rule for flow direction. */ - u32 return_mark; /**< SKB mark associated with this rule for return direction. */ -}; - -/** - * DSCP connection rule structure. - */ -struct sfe_dscp_rule { - u8 flow_dscp; /**< Egress DSCP value for flow direction. */ - u8 return_dscp; /**< Egress DSCP value for return direction. */ - u8 reserved[2]; /**< Reserved; padding for alignment. */ -}; - -/** - * VLAN connection rule structure. - */ -struct sfe_vlan_rule { - u32 ingress_vlan_tag; /**< VLAN tag for ingress packets. */ - u32 egress_vlan_tag; /**< VLAN tag for egress packets. */ -}; - -/** - * Acceleration direction rule structure. - * Sometimes it is useful to accelerate traffic in one direction and not in another. - */ -struct sfe_acceleration_direction_rule { - u8 flow_accel; /**< Accelerate in flow direction. */ - u8 return_accel; /**< Accelerate in return direction. */ - u8 reserved[2]; /**< Reserved; padding for alignment. */ -}; - -/** - * Service class rule information in both directions. - */ -struct sfe_service_class_rule { - uint32_t flow_mark; /**< Service class information in flow direction. */ - uint32_t return_mark; /**< Service class information in return direction. */ -}; - -/** - * IPv4 rule create submessage structure. - */ -struct sfe_ipv4_rule_create_msg { - /* Request */ - u16 valid_flags; /**< Bit flags associated with paramater validity. */ - u16 rule_flags; /**< Bit flags associated with the rule. */ - - struct sfe_ipv4_5tuple tuple; /**< Holds values of 5-tuple. */ - - struct sfe_ipv4_connection_rule conn_rule; /**< Basic connection-specific data. */ - struct sfe_protocol_tcp_rule tcp_rule; /**< TCP-related acceleration parameters. */ - struct sfe_pppoe_rule pppoe_rule; /**< PPPoE-related acceleration parameters. */ - struct sfe_qos_rule qos_rule; /**< QoS-related acceleration parameters. */ - struct sfe_src_mac_rule src_mac_rule; /**< Source MAC address rule. */ - struct sfe_mark_rule mark_rule; /**< SKB mark-related acceleration parameters. 
*/ - struct sfe_dscp_rule dscp_rule; /**< DSCP-related acceleration parameters. */ - struct sfe_vlan_rule vlan_primary_rule; /**< Primary VLAN-related acceleration parameters. */ - struct sfe_vlan_rule vlan_secondary_rule; /**< Secondary VLAN-related acceleration parameters. */ -#ifdef CONFIG_XFRM - struct sfe_acceleration_direction_rule direction_rule; - /**< Direction related acceleration parameters. */ -#endif - /* Response */ - struct sfe_service_class_rule sawf_rule; - /**< Service class related information */ - u32 index; /**< Slot ID for cache statistics to host OS. */ -}; - -/** - * IPv4 rule destroy submessage structure. - */ -struct sfe_ipv4_rule_destroy_msg { - struct sfe_ipv4_5tuple tuple; /**< Holds values of 5-tuple. */ -}; - -/** - * The SFE IPv4 rule sync structure. - */ -struct sfe_ipv4_conn_sync { - u32 index; /**< Slot ID for cache statistics to host OS. */ - u8 protocol; /**< Protocol number. */ - __be32 flow_ip; /**< Flow IP address. */ - __be32 flow_ip_xlate; /**< Translated flow IP address. */ - __be16 flow_ident; /**< Flow identifier, e.g., port. */ - __be16 flow_ident_xlate; /**< Translated flow identifier, e.g., port. */ - u32 flow_max_window; /**< Flow direction's largest seen window. */ - u32 flow_end; /**< Flow direction's largest seen sequence + segment length. */ - u32 flow_max_end; /**< Flow direction's largest seen ack + max(1, win). */ - u32 flow_rx_packet_count; /**< Flow interface's Rx packet count. */ - u32 flow_rx_byte_count; /**< Flow interface's Rx byte count. */ - u32 flow_tx_packet_count; /**< Flow interface's Tx packet count. */ - u32 flow_tx_byte_count; /**< Flow interface's Tx byte count. */ - u16 flow_pppoe_session_id; /**< Flow interface`s PPPoE session ID. */ - u16 flow_pppoe_remote_mac[3]; /**< Flow interface's PPPoE remote server MAC address (if present). */ - __be32 return_ip; /**< Return IP address. */ - __be32 return_ip_xlate; /**< Translated return IP address */ - __be16 return_ident; /**< Return identifier, e.g., port. */ - __be16 return_ident_xlate; /**< Translated return identifier, e.g., port. */ - u32 return_max_window; /**< Return direction's largest seen window. */ - u32 return_end; /**< Return direction's largest seen sequence + segment length. */ - u32 return_max_end; /**< Return direction's largest seen ack + max(1, win). */ - u32 return_rx_packet_count; /**< Return interface's Rx packet count. */ - u32 return_rx_byte_count; /**< Return interface's Rx byte count. */ - u32 return_tx_packet_count; /**< Return interface's Tx packet count. */ - u32 return_tx_byte_count; /**< Return interface's Tx byte count. */ - u16 return_pppoe_session_id; /**< Return interface`s PPPoE session ID. */ - u16 return_pppoe_remote_mac[3]; /**< Return interface's PPPoE remote server MAC address (if present). */ - u32 inc_ticks; /**< Number of ticks since the last sync. */ - u32 reason; /**< Synchronization reason. */ - - u8 flags; /**< Bit flags associated with the rule. */ - u32 qos_tag; /**< QoS tag. */ - u32 cause; /**< Flush cause. */ -}; - -/** - * Information for a multiple IPv4 connection statistics synchronization message. - */ -struct sfe_ipv4_conn_sync_many_msg { - /* - * Request - */ - uint16_t index; /**< Request connection statistics from the index. */ - uint16_t size; /**< Buffer size of this message. */ - - /* - * Response - */ - uint16_t next; /**< Firmware response for the next connection to be requested. */ - uint16_t count; /**< Number of synchronized connections included in this message. 
*/ - struct sfe_ipv4_conn_sync conn_sync[]; /**< Array for the statistics. */ -}; - -/** - * Message structure to send/receive IPv4 bridge/route commands - */ -struct sfe_ipv4_msg { - struct sfe_cmn_msg cm; /**< Message header. */ - union { - struct sfe_ipv4_rule_create_msg rule_create; /**< Rule create message. */ - struct sfe_ipv4_rule_destroy_msg rule_destroy; /**< Rule destroy message. */ - struct sfe_ipv4_conn_sync conn_stats; /**< Connection statistics synchronization message. */ - struct sfe_ipv4_conn_sync_many_msg conn_stats_many; - /**< Many connections' statistics synchronization message. */ - } msg; /**< IPv4 message. */ -}; - -/** - * @} - */ - -/** - * @addtogroup nss_sfe_functions - * @{ - */ - -/** - * IPv4 message received callback. - */ -typedef void (*sfe_ipv4_msg_callback_t)(void *app_data, struct sfe_ipv4_msg *msg); - -/** - * @} - */ - -/** - * @addtogroup nss_sfe_datatypes - * @{ - */ - -/** - * IPv6 5-tuple structure. - */ -struct sfe_ipv6_5tuple { - __be32 flow_ip[4]; /**< Flow IP address. */ - __be32 return_ip[4]; /**< Return IP address. */ - __be16 flow_ident; /**< Flow identifier, e.g.,TCP/UDP port. */ - __be16 return_ident; /**< Return identifier, e.g., TCP/UDP port. */ - u8 protocol; /**< Protocol number. */ - u8 reserved[3]; /**< Reserved; padding for alignment. */ -}; - -/** - * IPv6 connection rule structure. - */ -struct sfe_ipv6_connection_rule { - u8 flow_mac[6]; /**< Flow MAC address. */ - u8 return_mac[6]; /**< Return MAC address. */ - s32 flow_interface_num; /**< Flow interface number. */ - s32 return_interface_num; /**< Return interface number. */ - s32 flow_top_interface_num; /**< Top flow interface number. */ - s32 return_top_interface_num; /**< Top return interface number. */ - u32 flow_mtu; /**< Flow interface's MTU. */ - u32 return_mtu; /**< Return interface's MTU. */ -}; - -/** - * IPv6 rule create submessage structure. - */ -struct sfe_ipv6_rule_create_msg { - /* - * Request - */ - u16 valid_flags; /**< Bit flags associated with parameter validity. */ - u16 rule_flags; /**< Bit flags associated with the rule. */ - struct sfe_ipv6_5tuple tuple; /**< Holds values of the sfe_ipv6_5tuple tuple. */ - struct sfe_ipv6_connection_rule conn_rule; /**< Basic connection-specific data. */ - struct sfe_protocol_tcp_rule tcp_rule; /**< Protocol-related acceleration parameters. */ - struct sfe_pppoe_rule pppoe_rule; /**< PPPoE-related acceleration parameters. */ - struct sfe_qos_rule qos_rule; /**< QoS-related acceleration parameters. */ - struct sfe_src_mac_rule src_mac_rule; /**< Source MAC address rule. */ - struct sfe_mark_rule mark_rule; /**< SKB mark-related acceleration parameters. */ - struct sfe_dscp_rule dscp_rule; /**< DSCP-related acceleration parameters. */ - struct sfe_vlan_rule vlan_primary_rule; /**< VLAN-related acceleration parameters. */ - struct sfe_vlan_rule vlan_secondary_rule; /**< VLAN-related acceleration parameters. */ -#ifdef CONFIG_XFRM - struct sfe_acceleration_direction_rule direction_rule; - /**< Direction-related acceleration parameters. */ -#endif - /* - * Response - */ - struct sfe_service_class_rule sawf_rule; /**< Service class related information. */ - u32 index; /**< Slot ID for cache statistics to host OS. */ -}; - -/** - * IPv6 rule destroy submessage structure. - */ -struct sfe_ipv6_rule_destroy_msg { - struct sfe_ipv6_5tuple tuple; /**< Holds values of the sfe_ipv6_5tuple tuple */ -}; - -/** - * SFE IPv6 rule sync structure. - */ -struct sfe_ipv6_conn_sync { - u32 index; /**< Slot ID for cache statistics to host OS. 
*/ - u8 protocol; /**< Protocol number. */ - __be32 flow_ip[4]; /**< Flow IP address. */ - __be16 flow_ident; /**< Flow identifier, e.g., port. */ - u32 flow_max_window; /**< Flow direction's largest seen window. */ - u32 flow_end; /**< Flow direction's largest seen sequence + segment length. */ - u32 flow_max_end; /**< Flow direction's largest seen ack + max(1, win). */ - u32 flow_rx_packet_count; /**< Flow interface's Rx packet count. */ - u32 flow_rx_byte_count; /**< Flow interface's Rx byte count. */ - u32 flow_tx_packet_count; /**< Flow interface's Tx packet count. */ - u32 flow_tx_byte_count; /**< Flow interface's Tx byte count. */ - u16 flow_pppoe_session_id; /**< Flow interface`s PPPoE session ID. */ - u16 flow_pppoe_remote_mac[3]; /**< Flow interface's PPPoE remote server MAC address (if present). */ - __be32 return_ip[4]; /**< Return IP address. */ - __be16 return_ident; /**< Return identifer, e.g., port. */ - u32 return_max_window; /**< Return direction's largest seen window. */ - u32 return_end; /**< Return direction's largest seen sequence + segment length. */ - u32 return_max_end; /**< Return direction's largest seen ack + max(1, win). */ - u32 return_rx_packet_count; /**< Return interface's Rx packet count. */ - u32 return_rx_byte_count; /**< Return interface's Rx byte count. */ - u32 return_tx_packet_count; /**< Return interface's Tx packet count. */ - u32 return_tx_byte_count; /**< Return interface's Tx byte count. */ - u16 return_pppoe_session_id; /**< Return interface`s PPPoE session ID. */ - u16 return_pppoe_remote_mac[3]; /**< Return interface's PPPoE remote server MAC address (if present). */ - u32 inc_ticks; /**< Number of ticks since the last sync. */ - u32 reason; /**< Sync reason. */ - u8 flags; /**< Bit flags associated with the rule. */ - u32 qos_tag; /**< QoS tag. */ - u32 cause; /**< Flush cause associated with the rule. */ -}; - -/** - * Information for a multiple IPv6 connection statistics synchronization message. - */ -struct sfe_ipv6_conn_sync_many_msg { - /* - * Request: - */ - uint16_t index; /**< Request connection statistics from the index. */ - uint16_t size; /**< Buffer size of this message. */ - - /* - * Response: - */ - uint16_t next; /**< Firmware response for the next connection to be requested. */ - uint16_t count; /**< Number of synchronized connections included in this message. */ - struct sfe_ipv6_conn_sync conn_sync[]; /**< Array for the statistics. */ -}; - -/** - * Message structure to send/receive IPv6 bridge/route commands. - */ -struct sfe_ipv6_msg { - struct sfe_cmn_msg cm; /**< Message header. */ - union { - struct sfe_ipv6_rule_create_msg rule_create; - /**< Rule create message. */ - struct sfe_ipv6_rule_destroy_msg rule_destroy; - /**< Rule destroy message. */ - struct sfe_ipv6_conn_sync conn_stats; - /**< Statistics synchronization message. */ - struct sfe_ipv6_conn_sync_many_msg conn_stats_many; - /**< Many Connections' statistics synchronizaion message. */ - } msg; /**< IPv6 message. */ -}; - -/** - * @} - */ - -/** - * @addtogroup nss_sfe_functions - * @{ - */ - -/** - * IPv6 message received callback. - */ -typedef void (*sfe_ipv6_msg_callback_t)(void *app_data, struct sfe_ipv6_msg *msg); - -/** - * @} - */ - -/** - * @addtogroup nss_sfe_datatypes - * @{ - */ - -/** - * 6rd tunnel peer address. - */ -struct sfe_tun6rd_set_peer_msg { - __be32 ipv6_address[4]; /**< The peer's IPv6 address. */ - __be32 dest; /**< The peer's IPv4 address. */ -}; - -/** - * Message structure to send/receive 6rd tunnel messages. 
- */ -struct sfe_tun6rd_msg { - struct sfe_cmn_msg cm; /**< Message header. */ - union { - struct sfe_tun6rd_set_peer_msg peer; /**< Add or update peer message. */ - } msg; /**< 6RD tunnel message. */ -}; - -/** - * SFE context instance. - */ -struct sfe_ctx_instance { - int not_used; /**< Not used. */ -}; - -/** - * @} - */ - -/** - * @addtogroup nss_sfe_functions - * @{ - */ - -/** - * Copy the IPv4 statistics for the given service class. - * - * @param sid Service class ID - * @param bytes Pointer to where byte count should be written. - * @param packets Pointer to where packet count should be written. - * - * @return - * True if successful, false if maximum retries exceeded; bool. - */ -extern bool sfe_service_class_stats_get(uint8_t sid, uint64_t *bytes, uint64_t *packets); - -/** - * Gets the maximum number of IPv4 connections supported by the SFE acceleration engine. - * - * @return - * The maximum number of connections that can be accelerated by the SFE. - */ -int sfe_ipv4_max_conn_count(void); - -/** - * Transmits an IPv4 message to the SFE. - * - * @param sfe_ctx SFE context. - * @param msg The IPv4 message. - * - * @return - * The status of the Tx operation (#sfe_tx_status_t). - */ -extern sfe_tx_status_t sfe_ipv4_tx(struct sfe_ctx_instance *sfe_ctx, struct sfe_ipv4_msg *msg); - -/** - * Registers a notifier callback for IPv4 messages from the SFE. - * - * @param one_rule_cb The callback pointer for one rule. - * @param many_rules_cb The callback pointer for many rules. - * @param app_data The application context for this message. - * - * @return - * The SFE context (#sfe_ctx_instance). - */ -extern struct sfe_ctx_instance *sfe_ipv4_notify_register(sfe_ipv4_msg_callback_t one_rule_cb, - sfe_ipv4_msg_callback_t many_rules_cb,void *app_data); - -/** - * Unregisters a notifier callback for IPv4 messages from the SFE. - * - * @return - * None. - */ -extern void sfe_ipv4_notify_unregister(void); - -/** - * Initializes an IPv4 message. - * - * @param nim The IPv4 message pointer. - * @param if_num The interface number. - * @param type The type of the message. - * @param len The length of the message. - * @param cb The message callback. - * @param app_data The application context for this message. - * - */ -extern void sfe_ipv4_msg_init(struct sfe_ipv4_msg *nim, u16 if_num, u32 type, u32 len, - sfe_ipv4_msg_callback_t cb, void *app_data); - -/** - * Gets the maximum number of IPv6 connections supported by the SFE acceleration engine. - * - * @return - * The maximum number of connections that can be accelerated by the SFE; integer. - */ -int sfe_ipv6_max_conn_count(void); - -/** - * Transmits an IPv6 message to the SFE. - * - * @param sfe_ctx The SFE context. - * @param msg The IPv6 message. - * - * @return - * The status of the Tx operation (#sfe_tx_status_t). - */ -extern sfe_tx_status_t sfe_ipv6_tx(struct sfe_ctx_instance *sfe_ctx, struct sfe_ipv6_msg *msg); - -/** - * Registers a notifier callback for IPv6 messages from the SFE. - * - * @param one_rule_cb The callback pointer for one rule. - * @param many_rules_cb The callback pointer for many rules. - * - * @return - * The SFE context (#sfe_ctx_instance). - */ -extern struct sfe_ctx_instance *sfe_ipv6_notify_register(sfe_ipv6_msg_callback_t one_rule_cb, - sfe_ipv6_msg_callback_t many_rules_cb,void *app_data); - -/** - * Unregisters a notifier callback for IPv6 messages from the SFE. - */ -extern void sfe_ipv6_notify_unregister(void); - -/** - * Initializes an IPv6 message. - * - * @param nim The IPv6 message pointer. 
- * @param if_num The interface number. - * @param type The type of the message. - * @param len The length of the message. - * @param cb The message callback. - * @param app_data The application context for this message. - * - * @return - * None. - */ -extern void sfe_ipv6_msg_init(struct sfe_ipv6_msg *nim, u16 if_num, u32 type, u32 len, - sfe_ipv6_msg_callback_t cb, void *app_data); - -/** - * Transmits a 6rd tunnel message to the SFE. - * - * @param sfe_ctx The SFE context pointer. - * @param msg The 6rd tunnel message pointer. - * - * @return - * The status of the Tx operation (#sfe_tx_status_t). - */ -sfe_tx_status_t sfe_tun6rd_tx(struct sfe_ctx_instance *sfe_ctx, struct sfe_tun6rd_msg *msg); - -/** - * Initializes a 6rd tunnel message. - * - * @param ncm The 6rd tunnel message pointer. - * @param if_num The interface number. - * @param type The type of the message. - * @param len The length of the message. - * @param cb The message callback. - * @param app_data The application context for this message. - * - * @return - * None. - */ -void sfe_tun6rd_msg_init(struct sfe_tun6rd_msg *ncm, u16 if_num, u32 type, u32 len, - void *cb, void *app_data); - -/** - * Indicates whether the l2 feature flag is enabled or disabled. - * - * @return - * True if enabled; false if disabled. - */ -bool sfe_is_l2_feature_enabled(void); - -/** - * Updates mark values of an IPv4 connection. - * - * @param mark The mark object. - * - * @return - * None. - */ -void sfe_ipv4_mark_rule_update(struct sfe_connection_mark *mark); - -/** - * Updates mark values of an IPv6 connection. - * - * @param mark The mark object. - * - * @return - * None. - */ -void sfe_ipv6_mark_rule_update(struct sfe_connection_mark *mark); - -/** - * Gets the acceleration mode of PPPoE bridge. - * - * @return - * The acceleration mode. - */ -sfe_pppoe_br_accel_mode_t sfe_pppoe_get_br_accel_mode(void); - -/** - * @} - */ - -#endif /* __SFE_API_H */ diff --git a/shortcut-fe/fast-classifier/Makefile b/shortcut-fe/fast-classifier/Makefile new file mode 100644 index 000000000..09c1174dd --- /dev/null +++ b/shortcut-fe/fast-classifier/Makefile @@ -0,0 +1,109 @@ +# +# Copyright (c) 2013-2018 The Linux Foundation. All rights reserved. +# Permission to use, copy, modify, and/or distribute this software for +# any purpose with or without fee is hereby granted, provided that the +# above copyright notice and this permission notice appear in all copies. +# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT +# OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+# + +include $(TOPDIR)/rules.mk +include $(INCLUDE_DIR)/kernel.mk + +PKG_NAME:=fast-classifier +PKG_RELEASE:=6 + +include $(INCLUDE_DIR)/package.mk + +define KernelPackage/fast-classifier/Default + SECTION:=kernel + CATEGORY:=Kernel modules + SUBMENU:=Network Support + DEPENDS:=+kmod-ipt-conntrack +kmod-shortcut-fe + TITLE:=Kernel driver for FAST Classifier + FILES:=$(PKG_BUILD_DIR)/fast-classifier.ko + KCONFIG:= \ + CONFIG_NF_CONNTRACK_EVENTS=y \ + CONFIG_NF_CONNTRACK_CHAIN_EVENTS=y \ + CONFIG_NF_CONNTRACK_MARK=y \ + CONFIG_XFRM=y + CONFLICTS:=kmod-shortcut-fe-drv kmod-shortcut-fe-cm +endef + +define KernelPackage/fast-classifier + $(call KernelPackage/fast-classifier/Default) +endef + +define KernelPackage/fast-classifier-noload + $(call KernelPackage/fast-classifier/Default) +endef + +define KernelPackage/fast-classifier/Default/description +FAST Classifier talks to SFE to make decisions about offloading connections +endef + +define KernelPackage/fast-classifier/description +$(call KernelPackage/fast-classifier/Default/description) +endef + +define KernelPackage/fast-classifier-noload/description +$(call KernelPackage/fast-classifier/Default/description) + +This package does not load fast-classifier at boot by default +endef + +define Package/fast-classifier-example + TITLE:=Example user space program for fast-classifier + DEPENDS:=+libnl +kmod-fast-classifier +endef + +define Package/fast-classifier-example/description +Example user space program that communicates with fast +classifier kernel module +endef + +HAVE_ECM:=$(CONFIG_PACKAGE_kmod-qca-nss-ecm-premium)$(CONFIG_PACKAGE_kmod-qca-nss-ecm-noload)$(CONFIG_PACKAGE_kmod-qca-nss-ecm-premium-noload)$(CONFIG_PACKAGE_kmod-qca-nss-ecm-standard) + +define Build/Compile/kmod + +$(MAKE) $(PKG_JOBS) -C "$(LINUX_DIR)" \ + $(KERNEL_MAKE_FLAGS) \ + $(PKG_MAKE_FLAGS) \ + M="$(PKG_BUILD_DIR)" \ + CONFIG_FAST_CLASSIFIER=m \ + EXTRA_CFLAGS+="-DSFE_SUPPORT_IPV6" \ + $(if $(HAVE_ECM),EXTRA_CFLAGS+="-DCONFIG_SFE_ECM" CONFIG_SFE_ECM=y,) \ + modules +endef + +define Build/Compile/example + $(TARGET_CC) -o $(PKG_BUILD_DIR)/userspace_fast_classifier \ + -I $(PKG_BUILD_DIR) \ + -I$(STAGING_DIR)/usr/include/libnl \ + -I$(STAGING_DIR)/usr/include/libnl3 \ + -lnl-genl-3 -lnl-3 \ + $(PKG_BUILD_DIR)/nl_classifier_test.c +endef + +define Build/Compile + $(Build/Compile/kmod) + $(if $(CONFIG_PACKAGE_fast-classifier-example),$(Build/Compile/example)) +endef + +define Build/InstallDev + $(INSTALL_DIR) $(1)/usr/include + $(CP) $(PKG_BUILD_DIR)/fast-classifier.h $(1)/usr/include/ +endef + +define Package/fast-classifier-example/install + $(INSTALL_DIR) $(1)/sbin + $(CP) $(PKG_BUILD_DIR)/userspace_fast_classifier $(1)/sbin/ +endef + +$(eval $(call KernelPackage,fast-classifier)) +#$(eval $(call KernelPackage,fast-classifier-noload)) +#$(eval $(call BuildPackage,fast-classifier-example)) diff --git a/shortcut-fe/fast-classifier/src/Makefile b/shortcut-fe/fast-classifier/src/Makefile new file mode 100644 index 000000000..58dd06e01 --- /dev/null +++ b/shortcut-fe/fast-classifier/src/Makefile @@ -0,0 +1,10 @@ +obj-$(CONFIG_FAST_CLASSIFIER) += fast-classifier.o + +ifeq ($(SFE_SUPPORT_IPV6),) +SFE_SUPPORT_IPV6=y +endif +ccflags-$(SFE_SUPPORT_IPV6) += -DSFE_SUPPORT_IPV6 + +ccflags-y += -I$(obj)/../shortcut-fe + +obj ?= . 
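For context on the fast-classifier-example package defined above: the shipped test program is nl_classifier_test.c, which is outside this patch. A minimal, hypothetical sketch (not the actual test program) of the kind of request it sends, using the libnl-3 generic netlink API and the constants exported via fast-classifier.h (installed by Build/InstallDev), might look like this:

    #include <netlink/netlink.h>
    #include <netlink/genl/genl.h>
    #include <netlink/genl/ctrl.h>
    #include <fast-classifier.h>

    /* Ask the kernel module to permit offload of one connection tuple. */
    int request_offload(struct fast_classifier_tuple *tuple)
    {
    	struct nl_sock *sock;
    	struct nl_msg *msg;
    	int family, ret = -1;

    	sock = nl_socket_alloc();
    	if (!sock)
    		return -1;
    	if (genl_connect(sock) < 0)
    		goto out;

    	/* Resolve the family that the kernel module registered. */
    	family = genl_ctrl_resolve(sock, FAST_CLASSIFIER_GENL_NAME);
    	if (family < 0)
    		goto out;

    	msg = nlmsg_alloc();
    	if (!msg)
    		goto out;

    	if (!genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, family,
    			 FAST_CLASSIFIER_GENL_HDRSIZE, 0,
    			 FAST_CLASSIFIER_C_OFFLOAD, FAST_CLASSIFIER_GENL_VERSION) ||
    	    nla_put(msg, FAST_CLASSIFIER_A_TUPLE, sizeof(*tuple), tuple) < 0) {
    		nlmsg_free(msg);
    		goto out;
    	}

    	ret = nl_send_auto(sock, msg) < 0 ? -1 : 0;
    	nlmsg_free(msg);
    out:
    	nl_socket_free(sock);
    	return ret;
    }

The module answers asynchronously on the FAST_CLASSIFIER_GENL_MCGRP multicast group (the OFFLOADED/DONE events counted by fast_classifier_send_genl_msg() in the source below), so a real client would also subscribe to that group to observe the results.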
diff --git a/shortcut-fe/fast-classifier/src/fast-classifier.c b/shortcut-fe/fast-classifier/src/fast-classifier.c new file mode 100644 index 000000000..944dfae38 --- /dev/null +++ b/shortcut-fe/fast-classifier/src/fast-classifier.c @@ -0,0 +1,2002 @@ +/* + * fast-classifier.c + * Shortcut forwarding engine connection manager. + * fast-classifier + * + * Copyright (c) 2013-2018 The Linux Foundation. All rights reserved. + * Permission to use, copy, modify, and/or distribute this software for + * any purpose with or without fee is hereby granted, provided that the + * above copyright notice and this permission notice appear in all copies. + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT + * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include "fast-classifier.h" + +typedef enum fast_classifier_exception { + FAST_CL_EXCEPTION_PACKET_BROADCAST, + FAST_CL_EXCEPTION_PACKET_MULTICAST, + FAST_CL_EXCEPTION_NO_IIF, + FAST_CL_EXCEPTION_NO_CT, + FAST_CL_EXCEPTION_CT_NO_TRACK, + FAST_CL_EXCEPTION_CT_NO_CONFIRM, + FAST_CL_EXCEPTION_CT_IS_ALG, + FAST_CL_EXCEPTION_IS_IPV4_MCAST, + FAST_CL_EXCEPTION_IS_IPV6_MCAST, + FAST_CL_EXCEPTION_TCP_NOT_ASSURED, + FAST_CL_EXCEPTION_TCP_NOT_ESTABLISHED, + FAST_CL_EXCEPTION_UNKNOW_PROTOCOL, + FAST_CL_EXCEPTION_NO_SRC_DEV, + FAST_CL_EXCEPTION_NO_SRC_XLATE_DEV, + FAST_CL_EXCEPTION_NO_DEST_DEV, + FAST_CL_EXCEPTION_NO_DEST_XLATE_DEV, + FAST_CL_EXCEPTION_NO_BRIDGE, + FAST_CL_EXCEPTION_LOCAL_OUT, + FAST_CL_EXCEPTION_WAIT_FOR_ACCELERATION, + FAST_CL_EXCEPTION_UPDATE_PROTOCOL_FAIL, + FAST_CL_EXCEPTION_CT_DESTROY_MISS, + FAST_CL_EXCEPTION_MAX +} fast_classifier_exception_t; + +static char *fast_classifier_exception_events_string[FAST_CL_EXCEPTION_MAX] = { + "PACKET_BROADCAST", + "PACKET_MULTICAST", + "NO_IIF", + "NO_CT", + "CT_NO_TRACK", + "CT_NO_CONFIRM", + "CT_IS_ALG", + "IS_IPV4_MCAST", + "IS_IPV6_MCAST", + "TCP_NOT_ASSURED", + "TCP_NOT_ESTABLISHED", + "UNKNOW_PROTOCOL", + "NO_SRC_DEV", + "NO_SRC_XLATE_DEV", + "NO_DEST_DEV", + "NO_DEST_XLATE_DEV", + "NO_BRIDGE", + "LOCAL_OUT", + "WAIT_FOR_ACCELERATION", + "UPDATE_PROTOCOL_FAIL", + "CT_DESTROY_MISS", +}; + +/* + * Per-module structure. + */ +struct fast_classifier { + spinlock_t lock; /* Lock for SMP correctness */ + + /* + * Control state. + */ + struct kobject *sys_fast_classifier; /* sysfs linkage */ + + /* + * Callback notifiers. 
+ */
+	struct notifier_block dev_notifier;	/* Device notifier */
+	struct notifier_block inet_notifier;	/* IPv4 notifier */
+	struct notifier_block inet6_notifier;	/* IPv6 notifier */
+	u32 exceptions[FAST_CL_EXCEPTION_MAX];
+};
+
+static struct fast_classifier __sc;
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(5, 2, 0))
+static struct nla_policy fast_classifier_genl_policy[FAST_CLASSIFIER_A_MAX + 1] = {
+	[FAST_CLASSIFIER_A_TUPLE] = {
+		.type = NLA_UNSPEC,
+		.len = sizeof(struct fast_classifier_tuple)
+	},
+};
+#endif /*KERNEL_VERSION(5, 2, 0)*/
+
+static struct genl_multicast_group fast_classifier_genl_mcgrp[] = {
+	{
+		.name = FAST_CLASSIFIER_GENL_MCGRP,
+	},
+};
+
+static int fast_classifier_offload_genl_msg(struct sk_buff *skb, struct genl_info *info);
+static int fast_classifier_nl_genl_msg_DUMP(struct sk_buff *skb, struct netlink_callback *cb);
+
+static struct genl_ops fast_classifier_gnl_ops[] = {
+	{
+		.cmd = FAST_CLASSIFIER_C_OFFLOAD,
+		.flags = 0,
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(5, 2, 0))
+		.policy = fast_classifier_genl_policy,
+#endif /*KERNEL_VERSION(5, 2, 0)*/
+		.doit = fast_classifier_offload_genl_msg,
+		.dumpit = NULL,
+	},
+	{
+		.cmd = FAST_CLASSIFIER_C_OFFLOADED,
+		.flags = 0,
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(5, 2, 0))
+		.policy = fast_classifier_genl_policy,
+#endif /*KERNEL_VERSION(5, 2, 0)*/
+		.doit = NULL,
+		.dumpit = fast_classifier_nl_genl_msg_DUMP,
+	},
+	{
+		.cmd = FAST_CLASSIFIER_C_DONE,
+		.flags = 0,
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(5, 2, 0))
+		.policy = fast_classifier_genl_policy,
+#endif /*KERNEL_VERSION(5, 2, 0)*/
+		.doit = NULL,
+		.dumpit = fast_classifier_nl_genl_msg_DUMP,
+	},
+};
+
+static struct genl_family fast_classifier_gnl_family = {
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 10, 0))
+	.id = GENL_ID_GENERATE,
+#endif /*KERNEL_VERSION(4, 10, 0)*/
+	.hdrsize = FAST_CLASSIFIER_GENL_HDRSIZE,
+	.name = FAST_CLASSIFIER_GENL_NAME,
+	.version = FAST_CLASSIFIER_GENL_VERSION,
+	.maxattr = FAST_CLASSIFIER_A_MAX,
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0))
+	.ops = fast_classifier_gnl_ops,
+	.n_ops = ARRAY_SIZE(fast_classifier_gnl_ops),
+	.mcgrps = fast_classifier_genl_mcgrp,
+	.n_mcgrps = ARRAY_SIZE(fast_classifier_genl_mcgrp),
+#endif /*KERNEL_VERSION(4, 10, 0)*/
+};
+
+static atomic_t offload_msgs = ATOMIC_INIT(0);
+static atomic_t offload_no_match_msgs = ATOMIC_INIT(0);
+static atomic_t offloaded_msgs = ATOMIC_INIT(0);
+static atomic_t done_msgs = ATOMIC_INIT(0);
+
+static atomic_t offloaded_fail_msgs = ATOMIC_INIT(0);
+static atomic_t done_fail_msgs = ATOMIC_INIT(0);
+
+/*
+ * Accelerate incoming packets destined for a bridge device.
+ * If an incoming packet is ultimately destined for a bridge device,
+ * we will first see the packet arrive on the physical device, so we
+ * can skip straight to processing it as if it came from the bridge
+ * for some more performance gains.
+ *
+ * This only works when the hook is above the bridge. We
+ * only implement ingress for now, because for egress we
+ * want the bridge devices' qdiscs to be used.
+ */
+static bool skip_to_bridge_ingress;
+
+/*
+ * fast_classifier_incr_exceptions()
+ *	Increase an exception counter.
+ */
+static inline void fast_classifier_incr_exceptions(fast_classifier_exception_t except)
+{
+	struct fast_classifier *sc = &__sc;
+
+	spin_lock_bh(&sc->lock);
+	sc->exceptions[except]++;
+	spin_unlock_bh(&sc->lock);
+}
+
+/*
+ * fast_classifier_recv()
+ *	Handle packet receives.
+ *
+ * Returns 1 if the packet is forwarded or 0 if it isn't.
+ */
+int fast_classifier_recv(struct sk_buff *skb)
+{
+	struct net_device *dev;
+	struct net_device *master_dev = NULL;
+	int ret = 0;
+
+	/*
+	 * We know that for the vast majority of packets we need the transport
+	 * layer header so we may as well start to fetch it now!
+	 */
+	prefetch(skb->data + 32);
+	barrier();
+
+	dev = skb->dev;
+
+	/*
+	 * Process the packet as if it arrived on the bridge device.
+	 */
+	if (skip_to_bridge_ingress &&
+	    (dev->priv_flags & IFF_BRIDGE_PORT)) {
+		master_dev = sfe_dev_get_master(dev);
+		if (!master_dev) {
+			DEBUG_WARN("master dev is NULL %s\n", dev->name);
+			goto rx_exit;
+		}
+		dev = master_dev;
+	}
+
+	/*
+	 * We're only interested in IPv4 and IPv6 packets.
+	 */
+	if (likely(htons(ETH_P_IP) == skb->protocol)) {
+		struct in_device *in_dev;
+
+		/*
+		 * Does our input device support IP processing?
+		 */
+		in_dev = (struct in_device *)dev->ip_ptr;
+		if (unlikely(!in_dev)) {
+			DEBUG_TRACE("no IP processing for device: %s\n", dev->name);
+			goto rx_exit;
+		}
+
+		/*
+		 * Does it have an IP address? If it doesn't then we can't do anything
+		 * interesting here!
+		 */
+		if (unlikely(!in_dev->ifa_list)) {
+			DEBUG_TRACE("no IP address for device: %s\n", dev->name);
+			goto rx_exit;
+		}
+
+		ret = sfe_ipv4_recv(dev, skb);
+
+	} else if (likely(htons(ETH_P_IPV6) == skb->protocol)) {
+		struct inet6_dev *in_dev;
+
+		/*
+		 * Does our input device support IPv6 processing?
+		 */
+		in_dev = (struct inet6_dev *)dev->ip6_ptr;
+		if (unlikely(!in_dev)) {
+			DEBUG_TRACE("no IPv6 processing for device: %s\n", dev->name);
+			goto rx_exit;
+		}
+
+		/*
+		 * Does it have an IPv6 address? If it doesn't then we can't do anything
+		 * interesting here!
+		 */
+		if (unlikely(list_empty(&in_dev->addr_list))) {
+			DEBUG_TRACE("no IPv6 address for device: %s\n", dev->name);
+			goto rx_exit;
+		}
+
+		ret = sfe_ipv6_recv(dev, skb);
+
+	} else {
+		DEBUG_TRACE("not IP packet\n");
+	}
+
+rx_exit:
+	if (master_dev) {
+		dev_put(master_dev);
+	}
+
+	return ret;
+}
+
+/*
+ * fast_classifier_find_dev_and_mac_addr()
+ *	Find the device and MAC address for a given IPv4 or IPv6 address.
+ *
+ * Returns true if we find the device and MAC address, otherwise false.
+ *
+ * We look up the rtable entry for the address and, from its neighbour
+ * structure, obtain the hardware address. This means the function also
+ * works when the neighbours are routers.
+ */
+static bool fast_classifier_find_dev_and_mac_addr(struct sk_buff *skb, sfe_ip_addr_t *addr, struct net_device **dev, u8 *mac_addr, bool is_v4)
+{
+	struct neighbour *neigh;
+	struct rtable *rt;
+	struct rt6_info *rt6;
+	struct dst_entry *dst;
+	struct net_device *mac_dev;
+
+	/*
+	 * If an skb was provided, use its dst entry, because a plain route
+	 * lookup cannot resolve routes that are policy routed.
+	 */
+	if (unlikely(skb)) {
+		dst = skb_dst(skb);
+		goto skip_dst_lookup;
+	}
+
+	/*
+	 * Look up the rtable entry for the IP address, then get the hardware
+	 * address from its neighbour structure. This means this works when the
+	 * neighbours are routers too.
+	 */
+	if (likely(is_v4)) {
+		rt = ip_route_output(&init_net, addr->ip, 0, 0, 0);
+		if (unlikely(IS_ERR(rt))) {
+			goto ret_fail;
+		}
+
+		dst = (struct dst_entry *)rt;
+	}
+#ifdef SFE_SUPPORT_IPV6
+	else {
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 17, 0))
+		rt6 = rt6_lookup(&init_net, (struct in6_addr *)addr->ip6, 0, 0, NULL, 0);
+#else
+		rt6 = rt6_lookup(&init_net, (struct in6_addr *)addr->ip6, 0, 0, 0);
+#endif /*KERNEL_VERSION(4, 17, 0)*/
+		if (!rt6) {
+			goto ret_fail;
+		}
+
+		dst = (struct dst_entry *)rt6;
+	}
+#endif
+
+skip_dst_lookup:
+	rcu_read_lock();
+	neigh = sfe_dst_get_neighbour(dst, addr);
+	if (unlikely(!neigh)) {
+		rcu_read_unlock();
+		if (likely(!skb))
+			dst_release(dst);
+
+		goto ret_fail;
+	}
+
+	if (unlikely(!(neigh->nud_state & NUD_VALID))) {
+		rcu_read_unlock();
+		neigh_release(neigh);
+		if (likely(!skb))
+			dst_release(dst);
+
+		goto ret_fail;
+	}
+
+	mac_dev = neigh->dev;
+	if (!mac_dev) {
+		rcu_read_unlock();
+		neigh_release(neigh);
+		if (likely(!skb))
+			dst_release(dst);
+
+		goto ret_fail;
+	}
+
+	memcpy(mac_addr, neigh->ha, (size_t)mac_dev->addr_len);
+
+	dev_hold(mac_dev);
+	*dev = mac_dev;
+	rcu_read_unlock();
+	neigh_release(neigh);
+	if (likely(!skb))
+		dst_release(dst);
+
+	return true;
+
+ret_fail:
+	if (is_v4) {
+		DEBUG_TRACE("failed to find MAC address for IP: %pI4\n", addr);
+
+	} else {
+		DEBUG_TRACE("failed to find MAC address for IP: %pI6\n", addr);
+	}
+
+	return false;
+}
+
+static DEFINE_SPINLOCK(sfe_connections_lock);
+
+struct sfe_connection {
+	struct hlist_node hl;
+	struct sfe_connection_create *sic;
+	struct nf_conn *ct;
+	int hits;
+	int offload_permit;
+	int offloaded;
+	bool is_v4;
+	unsigned char smac[ETH_ALEN];
+	unsigned char dmac[ETH_ALEN];
+};
+
+static int sfe_connections_size;
+
+#define FC_CONN_HASH_ORDER 13
+static DEFINE_HASHTABLE(fc_conn_ht, FC_CONN_HASH_ORDER);
+
+static u32 fc_conn_hash(sfe_ip_addr_t *saddr, sfe_ip_addr_t *daddr,
+			unsigned short sport, unsigned short dport, bool is_v4)
+{
+	u32 idx, cnt = ((is_v4 ? sizeof(saddr->ip) : sizeof(saddr->ip6))/sizeof(u32));
+	u32 hash = 0;
+
+	for (idx = 0; idx < cnt; idx++) {
+		hash ^= ((u32 *)saddr)[idx] ^ ((u32 *)daddr)[idx];
+	}
+
+	return hash ^ (sport | (dport << 16));
+}
+
+/*
+ * fast_classifier_update_protocol()
+ *	Update the sfe_connection_create structure with new protocol information before we offload.
+ */
+static int fast_classifier_update_protocol(struct sfe_connection_create *p_sic, struct nf_conn *ct)
+{
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0)
+	struct net *net = NULL;
+	struct nf_tcp_net *tn = NULL;
+#endif
+	switch (p_sic->protocol) {
+	case IPPROTO_TCP:
+		p_sic->src_td_window_scale = ct->proto.tcp.seen[0].td_scale;
+		p_sic->src_td_max_window = ct->proto.tcp.seen[0].td_maxwin;
+		p_sic->src_td_end = ct->proto.tcp.seen[0].td_end;
+		p_sic->src_td_max_end = ct->proto.tcp.seen[0].td_maxend;
+		p_sic->dest_td_window_scale = ct->proto.tcp.seen[1].td_scale;
+		p_sic->dest_td_max_window = ct->proto.tcp.seen[1].td_maxwin;
+		p_sic->dest_td_end = ct->proto.tcp.seen[1].td_end;
+		p_sic->dest_td_max_end = ct->proto.tcp.seen[1].td_maxend;
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0)
+		net = nf_ct_net(ct);
+		tn = nf_tcp_pernet(net);
+		if ((tn && tn->tcp_no_window_check)
+#else
+		if (nf_ct_tcp_no_window_check
+#endif
+		    || (ct->proto.tcp.seen[0].flags & IP_CT_TCP_FLAG_BE_LIBERAL)
+		    || (ct->proto.tcp.seen[1].flags & IP_CT_TCP_FLAG_BE_LIBERAL)) {
+			p_sic->flags |= SFE_CREATE_FLAG_NO_SEQ_CHECK;
+		}
+
+		/*
+		 * If the connection is shutting down, do not manage it.
+		 * The state cannot be SYN_SENT or SYN_RECV because the connection is assured.
+		 * Not managed states: FIN_WAIT, CLOSE_WAIT, LAST_ACK, TIME_WAIT, CLOSE.
+		 */
+		spin_lock(&ct->lock);
+		if (ct->proto.tcp.state != TCP_CONNTRACK_ESTABLISHED) {
+			spin_unlock(&ct->lock);
+			fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_TCP_NOT_ESTABLISHED);
+			DEBUG_TRACE("connection in termination state: %#x, s: %pI4:%u, d: %pI4:%u\n",
+				    ct->proto.tcp.state, &p_sic->src_ip, ntohs(p_sic->src_port),
+				    &p_sic->dest_ip, ntohs(p_sic->dest_port));
+			return 0;
+		}
+		spin_unlock(&ct->lock);
+		break;
+
+	case IPPROTO_UDP:
+		break;
+
+	default:
+		fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_UNKNOW_PROTOCOL);
+		DEBUG_TRACE("unhandled protocol %d\n", p_sic->protocol);
+		return 0;
+	}
+
+	return 1;
+}
+
+/*
+ * fast_classifier_send_genl_msg()
+ *	Send a generic netlink message.
+ */
+static void fast_classifier_send_genl_msg(int msg, struct fast_classifier_tuple *fc_msg)
+{
+	struct sk_buff *skb;
+	int rc;
+	int buf_len;
+	int total_len;
+	void *msg_head;
+
+	/*
+	 * Calculate our packet payload size.
+	 * Start with our family header.
+	 */
+	buf_len = fast_classifier_gnl_family.hdrsize;
+
+	/*
+	 * Add the nla_total_size of each attribute we're going to nla_put().
+	 */
+	buf_len += nla_total_size(sizeof(*fc_msg));
+
+	/*
+	 * Lastly we need to add space for the NL message header since
+	 * genlmsg_new only accounts for the GENL header and not the
+	 * outer NL header. To do this, we use a NL helper function which
+	 * calculates the total size of a netlink message given a payload size.
+	 * Note this value does not include the GENL header, but that's
+	 * added automatically by genlmsg_new.
+	 */
+	total_len = nlmsg_total_size(buf_len);
+	skb = genlmsg_new(total_len, GFP_ATOMIC);
+	if (!skb)
+		return;
+
+	msg_head = genlmsg_put(skb, 0, 0, &fast_classifier_gnl_family, 0, msg);
+	if (!msg_head) {
+		nlmsg_free(skb);
+		return;
+	}
+
+	rc = nla_put(skb, FAST_CLASSIFIER_A_TUPLE, sizeof(struct fast_classifier_tuple), fc_msg);
+	if (rc != 0) {
+		genlmsg_cancel(skb, msg_head);
+		nlmsg_free(skb);
+		return;
+	}
+
+#if (LINUX_VERSION_CODE <= KERNEL_VERSION(3, 19, 0))
+	rc = genlmsg_end(skb, msg_head);
+	if (rc < 0) {
+		genlmsg_cancel(skb, msg_head);
+		nlmsg_free(skb);
+		return;
+	}
+#else
+	genlmsg_end(skb, msg_head);
+#endif
+
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3, 13, 0))
+	rc = genlmsg_multicast(&fast_classifier_gnl_family, skb, 0, 0, GFP_ATOMIC);
+#else
+	rc = genlmsg_multicast(skb, 0, fast_classifier_genl_mcgrp[0].id, GFP_ATOMIC);
+#endif
+	switch (msg) {
+	case FAST_CLASSIFIER_C_OFFLOADED:
+		if (rc == 0) {
+			atomic_inc(&offloaded_msgs);
+		} else {
+			atomic_inc(&offloaded_fail_msgs);
+		}
+		break;
+	case FAST_CLASSIFIER_C_DONE:
+		if (rc == 0) {
+			atomic_inc(&done_msgs);
+		} else {
+			atomic_inc(&done_fail_msgs);
+		}
+		break;
+	default:
+		DEBUG_ERROR("fast-classifier: Unknown message type sent!\n");
+		break;
+	}
+
+	DEBUG_TRACE("Notify NL message %d ", msg);
+	if (fc_msg->ethertype == AF_INET) {
+		DEBUG_TRACE("sip=%pI4 dip=%pI4 ", &fc_msg->src_saddr, &fc_msg->dst_saddr);
+	} else {
+		DEBUG_TRACE("sip=%pI6 dip=%pI6 ", &fc_msg->src_saddr, &fc_msg->dst_saddr);
+	}
+	DEBUG_TRACE("protocol=%d sport=%d dport=%d smac=%pM dmac=%pM\n",
+		    fc_msg->proto, fc_msg->sport, fc_msg->dport, fc_msg->smac, fc_msg->dmac);
+}
+
+/*
+ * fast_classifier_find_conn()
+ *	Find a connection object in the hash table.
+ * @pre the sfe_connections_lock must be held before calling this function
+ */
+static struct sfe_connection *
+fast_classifier_find_conn(sfe_ip_addr_t *saddr, sfe_ip_addr_t *daddr, + unsigned short sport, unsigned short dport, + unsigned char proto, bool is_v4) +{ + struct sfe_connection_create *p_sic; + struct sfe_connection *conn; + u32 key; +#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0)) + struct hlist_node *node; +#endif + + key = fc_conn_hash(saddr, daddr, sport, dport, is_v4); + + sfe_hash_for_each_possible(fc_conn_ht, conn, node, hl, key) { + if (conn->is_v4 != is_v4) { + continue; + } + + p_sic = conn->sic; + + if (p_sic->protocol == proto && + p_sic->src_port == sport && + p_sic->dest_port == dport && + sfe_addr_equal(&p_sic->src_ip, saddr, is_v4) && + sfe_addr_equal(&p_sic->dest_ip, daddr, is_v4)) { + return conn; + } + } + + DEBUG_TRACE("connection not found\n"); + return NULL; +} + +/* + * fast_classifier_sb_find_conn() + * find a connection object in the hash table according to information of packet + * if not found, reverse the tuple and try again. + * @pre the sfe_connection_lock must be held before calling this function + */ +static struct sfe_connection * +fast_classifier_sb_find_conn(sfe_ip_addr_t *saddr, sfe_ip_addr_t *daddr, + unsigned short sport, unsigned short dport, + unsigned char proto, bool is_v4) +{ + struct sfe_connection_create *p_sic; + struct sfe_connection *conn; + u32 key; +#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0)) + struct hlist_node *node; +#endif + + key = fc_conn_hash(saddr, daddr, sport, dport, is_v4); + + sfe_hash_for_each_possible(fc_conn_ht, conn, node, hl, key) { + if (conn->is_v4 != is_v4) { + continue; + } + + p_sic = conn->sic; + + if (p_sic->protocol == proto && + p_sic->src_port == sport && + p_sic->dest_port_xlate == dport && + sfe_addr_equal(&p_sic->src_ip, saddr, is_v4) && + sfe_addr_equal(&p_sic->dest_ip_xlate, daddr, is_v4)) { + return conn; + } + } + + /* + * Reverse the tuple and try again + */ + key = fc_conn_hash(daddr, saddr, dport, sport, is_v4); + + sfe_hash_for_each_possible(fc_conn_ht, conn, node, hl, key) { + if (conn->is_v4 != is_v4) { + continue; + } + + p_sic = conn->sic; + + if (p_sic->protocol == proto && + p_sic->src_port == dport && + p_sic->dest_port_xlate == sport && + sfe_addr_equal(&p_sic->src_ip, daddr, is_v4) && + sfe_addr_equal(&p_sic->dest_ip_xlate, saddr, is_v4)) { + return conn; + } + } + + DEBUG_TRACE("connection not found\n"); + return NULL; +} + +/* + * fast_classifier_add_conn() + * add a connection object in the hash table if no duplicate + * @conn connection to add + * @return conn if successful, NULL if duplicate + */ +static struct sfe_connection * +fast_classifier_add_conn(struct sfe_connection *conn) +{ + struct sfe_connection_create *sic = conn->sic; + u32 key; + + spin_lock_bh(&sfe_connections_lock); + if (fast_classifier_find_conn(&sic->src_ip, &sic->dest_ip, sic->src_port, + sic->dest_port, sic->protocol, conn->is_v4)) { + spin_unlock_bh(&sfe_connections_lock); + return NULL; + } + + key = fc_conn_hash(&sic->src_ip, &sic->dest_ip, + sic->src_port, sic->dest_port, conn->is_v4); + + hash_add(fc_conn_ht, &conn->hl, key); + sfe_connections_size++; + spin_unlock_bh(&sfe_connections_lock); + + DEBUG_TRACE(" -> adding item to sfe_connections, new size: %d\n", sfe_connections_size); + + if (conn->is_v4) { + DEBUG_TRACE("new offloadable: key: %u proto: %d src_ip: %pI4 dst_ip: %pI4, src_port: %d, dst_port: %d\n", + key, sic->protocol, &(sic->src_ip), &(sic->dest_ip), sic->src_port, sic->dest_port); + } else { + DEBUG_TRACE("new offloadable: key: %u proto: %d src_ip: %pI6 dst_ip: %pI6, src_port: %d, 
dst_port: %d\n", + key, sic->protocol, &(sic->src_ip), &(sic->dest_ip), sic->src_port, sic->dest_port); + } + + return conn; +} + +/* + * fast_classifier_offload_genl_msg() + * Called from user space to offload a connection + */ +static int +fast_classifier_offload_genl_msg(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *na; + struct fast_classifier_tuple *fc_msg; + struct sfe_connection *conn; + + na = info->attrs[FAST_CLASSIFIER_A_TUPLE]; + fc_msg = nla_data(na); + + if (fc_msg->ethertype == AF_INET) { + DEBUG_TRACE("want to offload: %d-%d, %pI4, %pI4, %d, %d SMAC=%pM DMAC=%pM\n", + fc_msg->ethertype, + fc_msg->proto, + &fc_msg->src_saddr, + &fc_msg->dst_saddr, + fc_msg->sport, + fc_msg->dport, + fc_msg->smac, + fc_msg->dmac); + } else { + DEBUG_TRACE("want to offload: %d-%d, %pI6, %pI6, %d, %d SMAC=%pM DMAC=%pM\n", + fc_msg->ethertype, + fc_msg->proto, + &fc_msg->src_saddr, + &fc_msg->dst_saddr, + fc_msg->sport, + fc_msg->dport, + fc_msg->smac, + fc_msg->dmac); + } + + spin_lock_bh(&sfe_connections_lock); + conn = fast_classifier_sb_find_conn((sfe_ip_addr_t *)&fc_msg->src_saddr, + (sfe_ip_addr_t *)&fc_msg->dst_saddr, + fc_msg->sport, + fc_msg->dport, + fc_msg->proto, + (fc_msg->ethertype == AF_INET)); + if (!conn) { + spin_unlock_bh(&sfe_connections_lock); + DEBUG_TRACE("REQUEST OFFLOAD NO MATCH\n"); + atomic_inc(&offload_no_match_msgs); + return 0; + } + + conn->offload_permit = 1; + spin_unlock_bh(&sfe_connections_lock); + atomic_inc(&offload_msgs); + + DEBUG_TRACE("INFO: calling sfe rule creation!\n"); + return 0; +} + +/* + * fast_classifier_nl_genl_msg_DUMP() + * ignore fast_classifier_messages OFFLOADED and DONE + */ +static int fast_classifier_nl_genl_msg_DUMP(struct sk_buff *skb, + struct netlink_callback *cb) +{ + return 0; +} + +/* auto offload connection once we have this many packets*/ +static int offload_at_pkts = 128; + +/* + * fast_classifier_post_routing() + * Called for packets about to leave the box - either locally generated or forwarded from another interface + */ +static unsigned int fast_classifier_post_routing(struct sk_buff *skb, bool is_v4) +{ + int ret; + struct sfe_connection_create sic; + struct sfe_connection_create *p_sic; + struct net_device *in; + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + struct net_device *dev; + struct net_device *src_dev; + struct net_device *dest_dev; + struct net_device *src_dev_tmp; + struct net_device *dest_dev_tmp; + struct net_device *src_br_dev = NULL; + struct net_device *dest_br_dev = NULL; + struct nf_conntrack_tuple orig_tuple; + struct nf_conntrack_tuple reply_tuple; + struct sfe_connection *conn; + struct sk_buff *tmp_skb = NULL; + + /* + * Don't process broadcast or multicast packets. + */ + if (unlikely(skb->pkt_type == PACKET_BROADCAST)) { + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_PACKET_BROADCAST); + DEBUG_TRACE("broadcast, ignoring\n"); + return NF_ACCEPT; + } + if (unlikely(skb->pkt_type == PACKET_MULTICAST)) { + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_PACKET_MULTICAST); + DEBUG_TRACE("multicast, ignoring\n"); + return NF_ACCEPT; + } + + /* + * Don't process packets that are not being forwarded. + */ + in = dev_get_by_index(&init_net, skb->skb_iif); + if (!in) { + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_NO_IIF); + DEBUG_TRACE("packet not forwarding\n"); + return NF_ACCEPT; + } + + dev_put(in); + + /* + * Don't process packets that aren't being tracked by conntrack. 
+ */ + ct = nf_ct_get(skb, &ctinfo); + if (unlikely(!ct)) { + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_NO_CT); + DEBUG_TRACE("no conntrack connection, ignoring\n"); + return NF_ACCEPT; + } + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0)) + /* + * Don't process untracked connections. + */ + if (unlikely(nf_ct_is_untracked(ct))) { + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_CT_NO_TRACK); + DEBUG_TRACE("untracked connection\n"); + return NF_ACCEPT; + } +#endif /*KERNEL_VERSION(4, 12, 0)*/ + + /* + * Unconfirmed connection may be dropped by Linux at the final step, + * So we don't process unconfirmed connections. + */ + if (!nf_ct_is_confirmed(ct)) { + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_CT_NO_CONFIRM); + DEBUG_TRACE("unconfirmed connection\n"); + return NF_ACCEPT; + } + + /* + * Don't process connections that require support from a 'helper' (typically a NAT ALG). + */ + if (unlikely(nfct_help(ct))) { + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_CT_IS_ALG); + DEBUG_TRACE("connection has helper\n"); + return NF_ACCEPT; + } + + memset(&sic, 0, sizeof(sic)); + + /* + * Look up the details of our connection in conntrack. + * + * Note that the data we get from conntrack is for the "ORIGINAL" direction + * but our packet may actually be in the "REPLY" direction. + */ + orig_tuple = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; + reply_tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple; + sic.protocol = (s32)orig_tuple.dst.protonum; + + sic.flags = 0; + + /* + * Get addressing information, non-NAT first + */ + if (likely(is_v4)) { + u32 dscp; + + sic.src_ip.ip = (__be32)orig_tuple.src.u3.ip; + sic.dest_ip.ip = (__be32)orig_tuple.dst.u3.ip; + + if (ipv4_is_multicast(sic.src_ip.ip) || ipv4_is_multicast(sic.dest_ip.ip)) { + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_IS_IPV4_MCAST); + DEBUG_TRACE("multicast address\n"); + return NF_ACCEPT; + } + + /* + * NAT'ed addresses - note these are as seen from the 'reply' direction + * When NAT does not apply to this connection these will be identical to the above. + */ + sic.src_ip_xlate.ip = (__be32)reply_tuple.dst.u3.ip; + sic.dest_ip_xlate.ip = (__be32)reply_tuple.src.u3.ip; + + dscp = ipv4_get_dsfield(ip_hdr(skb)) >> XT_DSCP_SHIFT; + if (dscp) { + sic.dest_dscp = dscp; + sic.src_dscp = sic.dest_dscp; + sic.flags |= SFE_CREATE_FLAG_REMARK_DSCP; + } + } else { + u32 dscp; + + sic.src_ip.ip6[0] = *((struct sfe_ipv6_addr *)&orig_tuple.src.u3.in6); + sic.dest_ip.ip6[0] = *((struct sfe_ipv6_addr *)&orig_tuple.dst.u3.in6); + + if (ipv6_addr_is_multicast((struct in6_addr *)sic.src_ip.ip6) || + ipv6_addr_is_multicast((struct in6_addr *)sic.dest_ip.ip6)) { + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_IS_IPV6_MCAST); + DEBUG_TRACE("multicast address\n"); + return NF_ACCEPT; + } + + /* + * NAT'ed addresses - note these are as seen from the 'reply' direction + * When NAT does not apply to this connection these will be identical to the above. 
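The DSCP capture is the same two-bit shift of the DS field on both address families; a self-contained worked example (the EF code point is purely illustrative):

#include <stdio.h>

int main(void)
{
	unsigned char dsfield = 0xb8;		/* hypothetical DS field: EF, ECN bits clear */
	unsigned int dscp = dsfield >> 2;	/* XT_DSCP_SHIFT == 2 */

	printf("dscp = %u\n", dscp);		/* prints 46; stored in sic.src_dscp/dest_dscp */
	return 0;
}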
+ */ + sic.src_ip_xlate.ip6[0] = *((struct sfe_ipv6_addr *)&reply_tuple.dst.u3.in6); + sic.dest_ip_xlate.ip6[0] = *((struct sfe_ipv6_addr *)&reply_tuple.src.u3.in6); + + dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> XT_DSCP_SHIFT; + if (dscp) { + sic.dest_dscp = dscp; + sic.src_dscp = sic.dest_dscp; + sic.flags |= SFE_CREATE_FLAG_REMARK_DSCP; + } + } + + switch (sic.protocol) { + case IPPROTO_TCP: + sic.src_port = orig_tuple.src.u.tcp.port; + sic.dest_port = orig_tuple.dst.u.tcp.port; + sic.src_port_xlate = reply_tuple.dst.u.tcp.port; + sic.dest_port_xlate = reply_tuple.src.u.tcp.port; + + /* + * Don't try to manage a non-established connection. + */ + if (!test_bit(IPS_ASSURED_BIT, &ct->status)) { + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_TCP_NOT_ASSURED); + DEBUG_TRACE("non-established connection\n"); + return NF_ACCEPT; + } + + break; + + case IPPROTO_UDP: + sic.src_port = orig_tuple.src.u.udp.port; + sic.dest_port = orig_tuple.dst.u.udp.port; + sic.src_port_xlate = reply_tuple.dst.u.udp.port; + sic.dest_port_xlate = reply_tuple.src.u.udp.port; + + /* + * Somehow, SFE is not playing nice with IPSec traffic. + * Do not accelerate for now. + */ + if (ntohs(sic.dest_port) == 4500 || ntohs(sic.dest_port) == 500) { + if (likely(is_v4)) + DEBUG_TRACE("quarkysg:: IPsec bypass: %pI4:%d(%pI4:%d) to %pI4:%d(%pI4:%d)\n", + &sic.src_ip.ip, ntohs(sic.src_port), &sic.src_ip_xlate.ip, ntohs(sic.src_port_xlate), + &sic.dest_ip.ip, ntohs(sic.dest_port), &sic.dest_ip_xlate.ip, ntohs(sic.dest_port_xlate)); + else + DEBUG_TRACE("quarkysg:: IPsec bypass: %pI6:%d to %pI6:%d\n", + &sic.src_ip.ip6, ntohs(sic.src_port), &sic.dest_ip.ip6, ntohs(sic.dest_port)); + return NF_ACCEPT; + } + break; + + default: + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_UNKNOW_PROTOCOL); + DEBUG_TRACE("unhandled protocol %d\n", sic.protocol); + return NF_ACCEPT; + } + +#ifdef CONFIG_XFRM + sic.original_accel = 1; + sic.reply_accel = 1; +#endif + + /* + * Get QoS information + */ + if (skb->priority) { + sic.dest_priority = skb->priority; + sic.src_priority = sic.dest_priority; + sic.flags |= SFE_CREATE_FLAG_REMARK_PRIORITY; + } + + if (is_v4) { + DEBUG_TRACE("POST_ROUTE: checking new connection: %d src_ip: %pI4 dst_ip: %pI4, src_port: %d, dst_port: %d\n", + sic.protocol, &sic.src_ip, &sic.dest_ip, sic.src_port, sic.dest_port); + } else { + DEBUG_TRACE("POST_ROUTE: checking new connection: %d src_ip: %pI6 dst_ip: %pI6, src_port: %d, dst_port: %d\n", + sic.protocol, &sic.src_ip, &sic.dest_ip, sic.src_port, sic.dest_port); + } + + /* + * If we already have this connection in our list, skip it + * XXX: this may need to be optimized + */ + spin_lock_bh(&sfe_connections_lock); + + conn = fast_classifier_find_conn(&sic.src_ip, &sic.dest_ip, sic.src_port, sic.dest_port, sic.protocol, is_v4); + if (conn) { + conn->hits++; + + if (!conn->offloaded) { + if (conn->offload_permit || conn->hits >= offload_at_pkts) { + DEBUG_TRACE("OFFLOADING CONNECTION, TOO MANY HITS\n"); + + if (fast_classifier_update_protocol(conn->sic, conn->ct) == 0) { + spin_unlock_bh(&sfe_connections_lock); + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_UPDATE_PROTOCOL_FAIL); + DEBUG_TRACE("UNKNOWN PROTOCOL OR CONNECTION CLOSING, SKIPPING\n"); + return NF_ACCEPT; + } + + DEBUG_TRACE("INFO: calling sfe rule creation!\n"); + spin_unlock_bh(&sfe_connections_lock); + + ret = is_v4 ? 
sfe_ipv4_create_rule(conn->sic) : sfe_ipv6_create_rule(conn->sic); + if ((ret == 0) || (ret == -EADDRINUSE)) { + struct fast_classifier_tuple fc_msg; + + if (is_v4) { + fc_msg.ethertype = AF_INET; + fc_msg.src_saddr.in = *((struct in_addr *)&sic.src_ip); + fc_msg.dst_saddr.in = *((struct in_addr *)&sic.dest_ip_xlate); + } else { + fc_msg.ethertype = AF_INET6; + fc_msg.src_saddr.in6 = *((struct in6_addr *)&sic.src_ip); + fc_msg.dst_saddr.in6 = *((struct in6_addr *)&sic.dest_ip_xlate); + } + + fc_msg.proto = sic.protocol; + fc_msg.sport = sic.src_port; + fc_msg.dport = sic.dest_port_xlate; + memcpy(fc_msg.smac, conn->smac, ETH_ALEN); + memcpy(fc_msg.dmac, conn->dmac, ETH_ALEN); + fast_classifier_send_genl_msg(FAST_CLASSIFIER_C_OFFLOADED, &fc_msg); + conn->offloaded = 1; + } + + return NF_ACCEPT; + } + } + + spin_unlock_bh(&sfe_connections_lock); + if (conn->offloaded) { + is_v4 ? sfe_ipv4_update_rule(conn->sic) : sfe_ipv6_update_rule(conn->sic); + } + + DEBUG_TRACE("FOUND, SKIPPING\n"); + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_WAIT_FOR_ACCELERATION); + return NF_ACCEPT; + } + + spin_unlock_bh(&sfe_connections_lock); + + /* + * Get the net device and MAC addresses that correspond to the various source and + * destination host addresses. + */ + if (!fast_classifier_find_dev_and_mac_addr(NULL, &sic.src_ip, &src_dev_tmp, sic.src_mac, is_v4)) { + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_NO_SRC_DEV); + return NF_ACCEPT; + } + src_dev = src_dev_tmp; + + if (!fast_classifier_find_dev_and_mac_addr(NULL, &sic.src_ip_xlate, &dev, sic.src_mac_xlate, is_v4)) { + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_NO_SRC_XLATE_DEV); + goto done1; + } + dev_put(dev); + + if (unlikely(!is_v4)) + tmp_skb = skb; + + if (!fast_classifier_find_dev_and_mac_addr(tmp_skb, &sic.dest_ip, &dev, sic.dest_mac, is_v4)) { + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_NO_DEST_DEV); + goto done1; + } + dev_put(dev); + + if (!fast_classifier_find_dev_and_mac_addr(skb, &sic.dest_ip_xlate, &dest_dev_tmp, sic.dest_mac_xlate, is_v4)) { + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_NO_DEST_XLATE_DEV); + goto done1; + } + dest_dev = dest_dev_tmp; + + /* + * Our devices may actually be part of a bridge interface. If that's + * the case then find the bridge interface instead. 
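sfe_dev_get_master() comes from elsewhere in this tree; as a sketch of what such a helper plausibly does with the stock upper-device API (an assumption, not the project's actual implementation):

static struct net_device *example_dev_get_master(struct net_device *dev)
{
	struct net_device *master;

	rcu_read_lock();
	master = netdev_master_upper_dev_get_rcu(dev);	/* bridge owning this port, if any */
	if (master)
		dev_hold(master);			/* caller must dev_put() it */
	rcu_read_unlock();

	return master;
}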
+ */ + if (src_dev->priv_flags & IFF_BRIDGE_PORT) { + src_br_dev = sfe_dev_get_master(src_dev); + if (!src_br_dev) { + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_NO_BRIDGE); + DEBUG_TRACE("no bridge found for: %s\n", src_dev->name); + goto done2; + } + src_dev = src_br_dev; + } + + if (dest_dev->priv_flags & IFF_BRIDGE_PORT) { + dest_br_dev = sfe_dev_get_master(dest_dev); + if (!dest_br_dev) { + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_NO_BRIDGE); + DEBUG_TRACE("no bridge found for: %s\n", dest_dev->name); + goto done3; + } + dest_dev = dest_br_dev; + } + + sic.src_dev = src_dev; + sic.dest_dev = dest_dev; + + sic.src_mtu = src_dev->mtu; + sic.dest_mtu = dest_dev->mtu; + + if (skb->mark) { + DEBUG_TRACE("SKB MARK NON ZERO %x\n", skb->mark); + } + sic.mark = skb->mark; + + conn = kmalloc(sizeof(*conn), GFP_ATOMIC); + if (!conn) { + printk(KERN_CRIT "ERROR: no memory for sfe\n"); + goto done4; + } + conn->hits = 0; + conn->offload_permit = 0; + conn->offloaded = 0; + conn->is_v4 = is_v4; + DEBUG_TRACE("Source MAC=%pM\n", sic.src_mac); + memcpy(conn->smac, sic.src_mac, ETH_ALEN); + memcpy(conn->dmac, sic.dest_mac_xlate, ETH_ALEN); + + p_sic = kmalloc(sizeof(*p_sic), GFP_ATOMIC); + if (!p_sic) { + printk(KERN_CRIT "ERROR: no memory for sfe\n"); + kfree(conn); + goto done4; + } + + memcpy(p_sic, &sic, sizeof(sic)); + conn->sic = p_sic; + conn->ct = ct; + + if (!fast_classifier_add_conn(conn)) { + kfree(conn->sic); + kfree(conn); + } + + /* + * If we had bridge ports then release them too. + */ +done4: + if (dest_br_dev) { + dev_put(dest_br_dev); + } +done3: + if (src_br_dev) { + dev_put(src_br_dev); + } +done2: + dev_put(dest_dev_tmp); +done1: + dev_put(src_dev_tmp); + + return NF_ACCEPT; +} + +/* + * fast_classifier_ipv4_post_routing_hook() + * Called for packets about to leave the box - either locally generated or forwarded from another interface + */ +fast_classifier_ipv4_post_routing_hook(hooknum, ops, skb, in_unused, out, okfn) +{ + return fast_classifier_post_routing(skb, true); +} + +/* + * fast_classifier_ipv6_post_routing_hook() + * Called for packets about to leave the box - either locally generated or forwarded from another interface + */ +fast_classifier_ipv6_post_routing_hook(hooknum, ops, skb, in_unused, out, okfn) +{ + return fast_classifier_post_routing(skb, false); +} + +/* + * fast_classifier_update_mark() + * updates the mark for a fast-classifier connection + */ +static void fast_classifier_update_mark(struct sfe_connection_mark *mark, bool is_v4) +{ + struct sfe_connection *conn; + + spin_lock_bh(&sfe_connections_lock); + + conn = fast_classifier_find_conn(&mark->src_ip, &mark->dest_ip, + mark->src_port, mark->dest_port, + mark->protocol, is_v4); + if (conn) { + conn->sic->mark = mark->mark; + } + + spin_unlock_bh(&sfe_connections_lock); +} + +#ifdef CONFIG_NF_CONNTRACK_EVENTS +/* + * fast_classifier_conntrack_event() + * Callback event invoked when a conntrack connection's state changes. 
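The callback below receives a bitmask of IPCT_* event numbers; only two of them matter here, as this condensed sketch of its filtering shows:

static inline bool example_event_is_interesting(unsigned long events)
{
	return (events & (1 << IPCT_MARK)) ||		/* connection mark was updated */
	       (events & (1 << IPCT_DESTROY));		/* connection is being torn down */
}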
+ */ +#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS +static int fast_classifier_conntrack_event(struct notifier_block *this, + unsigned long events, void *ptr) +#else +static int fast_classifier_conntrack_event(unsigned int events, struct nf_ct_event *item) +#endif +{ +#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS + struct nf_ct_event *item = ptr; +#endif + struct sfe_connection_destroy sid; + struct nf_conn *ct = item->ct; + struct nf_conntrack_tuple orig_tuple; + struct sfe_connection *conn; + struct fast_classifier_tuple fc_msg; + int offloaded = 0; + bool is_v4; + + /* + * If we don't have a conntrack entry then we're done. + */ + if (unlikely(!ct)) { + DEBUG_WARN("no ct in conntrack event callback\n"); + return NOTIFY_DONE; + } + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0)) + /* + * If this is an untracked connection then we can't have any state either. + */ + if (unlikely(nf_ct_is_untracked(ct))) { + DEBUG_TRACE("ignoring untracked conn\n"); + return NOTIFY_DONE; + } +#endif /*KERNEL_VERSION(4, 12, 0)*/ + + orig_tuple = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; + sid.protocol = (s32)orig_tuple.dst.protonum; + + /* + * Extract information from the conntrack connection. We're only interested + * in nominal connection information (i.e. we're ignoring any NAT information). + */ + if (likely(nf_ct_l3num(ct) == AF_INET)) { + sid.src_ip.ip = (__be32)orig_tuple.src.u3.ip; + sid.dest_ip.ip = (__be32)orig_tuple.dst.u3.ip; + is_v4 = true; + } else if (likely(nf_ct_l3num(ct) == AF_INET6)) { + sid.src_ip.ip6[0] = *((struct sfe_ipv6_addr *)&orig_tuple.src.u3.in6); + sid.dest_ip.ip6[0] = *((struct sfe_ipv6_addr *)&orig_tuple.dst.u3.in6); + is_v4 = false; + } else { + DEBUG_TRACE("ignoring non-IPv4 and non-IPv6 connection\n"); + return NOTIFY_DONE; + } + + switch (sid.protocol) { + case IPPROTO_TCP: + sid.src_port = orig_tuple.src.u.tcp.port; + sid.dest_port = orig_tuple.dst.u.tcp.port; + break; + + case IPPROTO_UDP: + sid.src_port = orig_tuple.src.u.udp.port; + sid.dest_port = orig_tuple.dst.u.udp.port; + break; + + default: + DEBUG_TRACE("unhandled protocol: %d\n", sid.protocol); + return NOTIFY_DONE; + } + + /* + * Check for an updated mark + */ + if ((events & (1 << IPCT_MARK)) && (ct->mark != 0)) { + struct sfe_connection_mark mark; + + mark.protocol = sid.protocol; + mark.src_ip = sid.src_ip; + mark.dest_ip = sid.dest_ip; + mark.src_port = sid.src_port; + mark.dest_port = sid.dest_port; + mark.mark = ct->mark; + + is_v4 ? 
sfe_ipv4_mark_rule(&mark) : sfe_ipv6_mark_rule(&mark); + fast_classifier_update_mark(&mark, is_v4); + } + + /* + * We're only interested in destroy events at this point + */ + if (unlikely(!(events & (1 << IPCT_DESTROY)))) { + DEBUG_TRACE("ignoring non-destroy event\n"); + return NOTIFY_DONE; + } + + if (is_v4) { + DEBUG_TRACE("Try to clean up: proto: %d src_ip: %pI4 dst_ip: %pI4, src_port: %d, dst_port: %d\n", + sid.protocol, &sid.src_ip, &sid.dest_ip, ntohs(sid.src_port), ntohs(sid.dest_port)); + } else { + DEBUG_TRACE("Try to clean up: proto: %d src_ip: %pI6 dst_ip: %pI6, src_port: %d, dst_port: %d\n", + sid.protocol, &sid.src_ip, &sid.dest_ip, ntohs(sid.src_port), ntohs(sid.dest_port)); + } + + spin_lock_bh(&sfe_connections_lock); + + conn = fast_classifier_find_conn(&sid.src_ip, &sid.dest_ip, sid.src_port, sid.dest_port, sid.protocol, is_v4); + if (conn && conn->offloaded) { + if (is_v4) { + fc_msg.ethertype = AF_INET; + fc_msg.src_saddr.in = *((struct in_addr *)&conn->sic->src_ip); + fc_msg.dst_saddr.in = *((struct in_addr *)&conn->sic->dest_ip_xlate); + } else { + fc_msg.ethertype = AF_INET6; + fc_msg.src_saddr.in6 = *((struct in6_addr *)&conn->sic->src_ip); + fc_msg.dst_saddr.in6 = *((struct in6_addr *)&conn->sic->dest_ip_xlate); + } + + fc_msg.proto = conn->sic->protocol; + fc_msg.sport = conn->sic->src_port; + fc_msg.dport = conn->sic->dest_port_xlate; + memcpy(fc_msg.smac, conn->smac, ETH_ALEN); + memcpy(fc_msg.dmac, conn->dmac, ETH_ALEN); + offloaded = 1; + } + + if (conn) { + DEBUG_TRACE("Free connection\n"); + + hash_del(&conn->hl); + sfe_connections_size--; + kfree(conn->sic); + kfree(conn); + } else { + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_CT_DESTROY_MISS); + } + + spin_unlock_bh(&sfe_connections_lock); + + is_v4 ? sfe_ipv4_destroy_rule(&sid) : sfe_ipv6_destroy_rule(&sid); + + if (offloaded) { + fast_classifier_send_genl_msg(FAST_CLASSIFIER_C_DONE, &fc_msg); + } + + return NOTIFY_DONE; +} + +/* + * Netfilter conntrack event system to monitor connection tracking changes + */ +#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS +static struct notifier_block fast_classifier_conntrack_notifier = { + .notifier_call = fast_classifier_conntrack_event, +}; +#else +static struct nf_ct_event_notifier fast_classifier_conntrack_notifier = { + .fcn = fast_classifier_conntrack_event, +}; +#endif +#endif + +/* + * Structure to establish a hook into the post routing netfilter point - this + * will pick up local outbound and packets going from one interface to another. + * + * Note: see include/linux/netfilter_ipv4.h for info related to priority levels. + * We want to examine packets after NAT translation and any ALG processing. + */ +static struct nf_hook_ops fast_classifier_ops_post_routing[] __read_mostly = { + SFE_IPV4_NF_POST_ROUTING_HOOK(__fast_classifier_ipv4_post_routing_hook), + SFE_IPV6_NF_POST_ROUTING_HOOK(__fast_classifier_ipv6_post_routing_hook), +}; + +/* + * fast_classifier_sync_rule() + * Synchronize a connection's state. 
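The TCP window bookkeeping in this function leans on the usual wrap-safe comparison of 32-bit sequence values; a standalone demonstration of the idiom:

#include <stdio.h>
#include <stdint.h>

/* "(s32)(a - b) < 0" reads as "a is earlier than b", even across wrap */
static int before(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) < 0;
}

int main(void)
{
	printf("%d\n", before(0xfffffff0u, 0x00000010u));	/* 1: just before the wrap point */
	printf("%d\n", before(0x00000010u, 0xfffffff0u));	/* 0 */
	return 0;
}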
+ */ +static void fast_classifier_sync_rule(struct sfe_connection_sync *sis) +{ + struct nf_conntrack_tuple_hash *h; + struct nf_conntrack_tuple tuple; + struct nf_conn *ct; + SFE_NF_CONN_ACCT(acct); + + /* + * Create a tuple so as to be able to look up a connection + */ + memset(&tuple, 0, sizeof(tuple)); + tuple.src.u.all = (__be16)sis->src_port; + tuple.dst.dir = IP_CT_DIR_ORIGINAL; + tuple.dst.protonum = (u8)sis->protocol; + tuple.dst.u.all = (__be16)sis->dest_port; + + if (sis->is_v6) { + tuple.src.u3.in6 = *((struct in6_addr *)sis->src_ip.ip6); + tuple.dst.u3.in6 = *((struct in6_addr *)sis->dest_ip.ip6); + tuple.src.l3num = AF_INET6; + + DEBUG_TRACE("update connection - p: %d, s: %pI6:%u, d: %pI6:%u\n", + (int)tuple.dst.protonum, + &tuple.src.u3.in6, (unsigned int)ntohs(tuple.src.u.all), + &tuple.dst.u3.in6, (unsigned int)ntohs(tuple.dst.u.all)); + } else { + tuple.src.u3.ip = sis->src_ip.ip; + tuple.dst.u3.ip = sis->dest_ip.ip; + tuple.src.l3num = AF_INET; + + DEBUG_TRACE("update connection - p: %d, s: %pI4:%u, d: %pI4:%u\n", + (int)tuple.dst.protonum, + &tuple.src.u3.ip, (unsigned int)ntohs(tuple.src.u.all), + &tuple.dst.u3.ip, (unsigned int)ntohs(tuple.dst.u.all)); + } + + /* + * Update packet count for ingress on bridge device + */ + if (skip_to_bridge_ingress) { + struct rtnl_link_stats64 nlstats; + nlstats.tx_packets = 0; + nlstats.tx_bytes = 0; + + if (sis->src_dev && IFF_EBRIDGE && + (sis->src_new_packet_count || sis->src_new_byte_count)) { + nlstats.rx_packets = sis->src_new_packet_count; + nlstats.rx_bytes = sis->src_new_byte_count; + spin_lock_bh(&sfe_connections_lock); + br_dev_update_stats(sis->src_dev, &nlstats); + spin_unlock_bh(&sfe_connections_lock); + } + if (sis->dest_dev && IFF_EBRIDGE && + (sis->dest_new_packet_count || sis->dest_new_byte_count)) { + nlstats.rx_packets = sis->dest_new_packet_count; + nlstats.rx_bytes = sis->dest_new_byte_count; + spin_lock_bh(&sfe_connections_lock); + br_dev_update_stats(sis->dest_dev, &nlstats); + spin_unlock_bh(&sfe_connections_lock); + } + } + + /* + * Look up conntrack connection + */ + h = nf_conntrack_find_get(&init_net, SFE_NF_CT_DEFAULT_ZONE, &tuple); + if (unlikely(!h)) { + DEBUG_TRACE("no connection found\n"); + return; + } + + ct = nf_ct_tuplehash_to_ctrack(h); +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 9, 0)) + NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct); +#endif /*KERNEL_VERSION(4, 9, 0)*/ + + /* + * Only update if this is not a fixed timeout + */ + if (!test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) { + spin_lock_bh(&ct->lock); +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 9, 0)) + ct->timeout += sis->delta_jiffies; +#else + ct->timeout.expires += sis->delta_jiffies; +#endif /*KERNEL_VERSION(4, 9, 0)*/ + spin_unlock_bh(&ct->lock); + } + + acct = nf_conn_acct_find(ct); + if (acct) { + spin_lock_bh(&ct->lock); + atomic64_add(sis->src_new_packet_count, &SFE_ACCT_COUNTER(acct)[IP_CT_DIR_ORIGINAL].packets); + atomic64_add(sis->src_new_byte_count, &SFE_ACCT_COUNTER(acct)[IP_CT_DIR_ORIGINAL].bytes); + atomic64_add(sis->dest_new_packet_count, &SFE_ACCT_COUNTER(acct)[IP_CT_DIR_REPLY].packets); + atomic64_add(sis->dest_new_byte_count, &SFE_ACCT_COUNTER(acct)[IP_CT_DIR_REPLY].bytes); + spin_unlock_bh(&ct->lock); + } + + switch (sis->protocol) { + case IPPROTO_TCP: + spin_lock_bh(&ct->lock); + if (ct->proto.tcp.seen[0].td_maxwin < sis->src_td_max_window) { + ct->proto.tcp.seen[0].td_maxwin = sis->src_td_max_window; + } + if ((s32)(ct->proto.tcp.seen[0].td_end - sis->src_td_end) < 0) { + ct->proto.tcp.seen[0].td_end = 
sis->src_td_end; + } + if ((s32)(ct->proto.tcp.seen[0].td_maxend - sis->src_td_max_end) < 0) { + ct->proto.tcp.seen[0].td_maxend = sis->src_td_max_end; + } + if (ct->proto.tcp.seen[1].td_maxwin < sis->dest_td_max_window) { + ct->proto.tcp.seen[1].td_maxwin = sis->dest_td_max_window; + } + if ((s32)(ct->proto.tcp.seen[1].td_end - sis->dest_td_end) < 0) { + ct->proto.tcp.seen[1].td_end = sis->dest_td_end; + } + if ((s32)(ct->proto.tcp.seen[1].td_maxend - sis->dest_td_max_end) < 0) { + ct->proto.tcp.seen[1].td_maxend = sis->dest_td_max_end; + } + spin_unlock_bh(&ct->lock); + break; + } + + /* + * Release connection + */ + nf_ct_put(ct); +} + +/* + * fast_classifier_device_event() + */ +static int fast_classifier_device_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct net_device *dev = SFE_DEV_EVENT_PTR(ptr); + + if (dev && (event == NETDEV_DOWN)) { + sfe_ipv4_destroy_all_rules_for_dev(dev); + sfe_ipv6_destroy_all_rules_for_dev(dev); + } + + return NOTIFY_DONE; +} + +/* + * fast_classifier_inet_event() + */ +static int fast_classifier_inet_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct net_device *dev = ((struct in_ifaddr *)ptr)->ifa_dev->dev; + + if (dev && (event == NETDEV_DOWN)) { + sfe_ipv4_destroy_all_rules_for_dev(dev); + } + + return NOTIFY_DONE; +} + +/* + * fast_classifier_inet6_event() + */ +static int fast_classifier_inet6_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct net_device *dev = ((struct inet6_ifaddr *)ptr)->idev->dev; + + if (dev && (event == NETDEV_DOWN)) { + sfe_ipv6_destroy_all_rules_for_dev(dev); + } + + return NOTIFY_DONE; +} + +/* + * fast_classifier_get_offload_at_pkts() + */ +static ssize_t fast_classifier_get_offload_at_pkts(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + return snprintf(buf, (ssize_t)PAGE_SIZE, "%d\n", offload_at_pkts); +} + +/* + * fast_classifier_set_offload_at_pkts() + */ +static ssize_t fast_classifier_set_offload_at_pkts(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t size) +{ + long new; + int ret; + + ret = kstrtol(buf, 0, &new); + if (ret == -EINVAL || ((int)new != new)) + return -EINVAL; + + offload_at_pkts = new; + + return size; +} + +/* + * fast_classifier_get_debug_info() + */ +static ssize_t fast_classifier_get_debug_info(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + size_t len = 0; + struct sfe_connection *conn; + u32 i; +#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0)) + struct hlist_node *node; +#endif + + spin_lock_bh(&sfe_connections_lock); + len += scnprintf(buf, PAGE_SIZE - len, "size=%d offload=%d offload_no_match=%d" + " offloaded=%d done=%d offloaded_fail=%d done_fail=%d\n", + sfe_connections_size, + atomic_read(&offload_msgs), + atomic_read(&offload_no_match_msgs), + atomic_read(&offloaded_msgs), + atomic_read(&done_msgs), + atomic_read(&offloaded_fail_msgs), + atomic_read(&done_fail_msgs)); + sfe_hash_for_each(fc_conn_ht, i, node, conn, hl) { + len += scnprintf(buf + len, PAGE_SIZE - len, + (conn->is_v4 ? 
"o=%d, p=%d [%pM]:%pI4:%u %pI4:%u:[%pM] m=%08x h=%d\n" : "o=%d, p=%d [%pM]:%pI6:%u %pI6:%u:[%pM] m=%08x h=%d\n"), + conn->offloaded, + conn->sic->protocol, + conn->sic->src_mac, + &conn->sic->src_ip, + ntohs(conn->sic->src_port), + &conn->sic->dest_ip, + ntohs(conn->sic->dest_port), + conn->sic->dest_mac_xlate, + conn->sic->mark, + conn->hits); + } + spin_unlock_bh(&sfe_connections_lock); + + return len; +} + +/* + * fast_classifier_get_skip_bridge_ingress() + */ +static ssize_t fast_classifier_get_skip_bridge_ingress(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + return snprintf(buf, (ssize_t)PAGE_SIZE, "%d\n", skip_to_bridge_ingress); +} + +/* + * fast_classifier_set_skip_bridge_ingress() + */ +static ssize_t fast_classifier_set_skip_bridge_ingress(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t size) +{ + long new; + int ret; + + ret = kstrtol(buf, 0, &new); + if (ret == -EINVAL || ((int)new != new)) + return -EINVAL; + + skip_to_bridge_ingress = new ? 1 : 0; + + return size; +} + +/* + * fast_classifier_get_exceptions + * dump exception counters + */ +static ssize_t fast_classifier_get_exceptions(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + int idx, len; + struct fast_classifier *sc = &__sc; + + spin_lock_bh(&sc->lock); + for (len = 0, idx = 0; idx < FAST_CL_EXCEPTION_MAX; idx++) { + if (sc->exceptions[idx]) { + len += snprintf(buf + len, (ssize_t)(PAGE_SIZE - len), "%s = %d\n", fast_classifier_exception_events_string[idx], sc->exceptions[idx]); + } + } + spin_unlock_bh(&sc->lock); + + return len; +} + +/* + * sysfs attributes. + */ +static const struct device_attribute fast_classifier_offload_at_pkts_attr = + __ATTR(offload_at_pkts, S_IWUSR | S_IRUGO, fast_classifier_get_offload_at_pkts, fast_classifier_set_offload_at_pkts); +static const struct device_attribute fast_classifier_debug_info_attr = + __ATTR(debug_info, S_IRUGO, fast_classifier_get_debug_info, NULL); +static const struct device_attribute fast_classifier_skip_bridge_ingress = + __ATTR(skip_to_bridge_ingress, S_IWUSR | S_IRUGO, fast_classifier_get_skip_bridge_ingress, fast_classifier_set_skip_bridge_ingress); +static const struct device_attribute fast_classifier_exceptions_attr = + __ATTR(exceptions, S_IRUGO, fast_classifier_get_exceptions, NULL); + +/* + * fast_classifier_init() + */ +static int __init fast_classifier_init(void) +{ + struct fast_classifier *sc = &__sc; + int result = -1; +#ifdef CONFIG_SFE_ECM + int (*fast_recv)(struct sk_buff *skb); +#endif + + printk(KERN_ALERT "fast-classifier: starting up\n"); + DEBUG_INFO("SFE CM init\n"); + + hash_init(fc_conn_ht); + + /* + * Create sys/fast_classifier + */ + sc->sys_fast_classifier = kobject_create_and_add("fast_classifier", NULL); + if (!sc->sys_fast_classifier) { + DEBUG_ERROR("failed to register fast_classifier\n"); + goto exit1; + } + + result = sysfs_create_file(sc->sys_fast_classifier, &fast_classifier_offload_at_pkts_attr.attr); + if (result) { + DEBUG_ERROR("failed to register offload at pkgs: %d\n", result); + goto exit2; + } + + result = sysfs_create_file(sc->sys_fast_classifier, &fast_classifier_debug_info_attr.attr); + if (result) { + DEBUG_ERROR("failed to register debug dev: %d\n", result); + sysfs_remove_file(sc->sys_fast_classifier, &fast_classifier_offload_at_pkts_attr.attr); + goto exit2; + } + + result = sysfs_create_file(sc->sys_fast_classifier, &fast_classifier_skip_bridge_ingress.attr); + if (result) { + DEBUG_ERROR("failed to register skip bridge on ingress: 
%d\n", result); + sysfs_remove_file(sc->sys_fast_classifier, &fast_classifier_offload_at_pkts_attr.attr); + sysfs_remove_file(sc->sys_fast_classifier, &fast_classifier_debug_info_attr.attr); + goto exit2; + } + + result = sysfs_create_file(sc->sys_fast_classifier, &fast_classifier_exceptions_attr.attr); + if (result) { + DEBUG_ERROR("failed to register exceptions file: %d\n", result); + sysfs_remove_file(sc->sys_fast_classifier, &fast_classifier_offload_at_pkts_attr.attr); + sysfs_remove_file(sc->sys_fast_classifier, &fast_classifier_debug_info_attr.attr); + sysfs_remove_file(sc->sys_fast_classifier, &fast_classifier_skip_bridge_ingress.attr); + goto exit2; + } + + sc->dev_notifier.notifier_call = fast_classifier_device_event; + sc->dev_notifier.priority = 1; + register_netdevice_notifier(&sc->dev_notifier); + + sc->inet_notifier.notifier_call = fast_classifier_inet_event; + sc->inet_notifier.priority = 1; + register_inetaddr_notifier(&sc->inet_notifier); + + sc->inet6_notifier.notifier_call = fast_classifier_inet6_event; + sc->inet6_notifier.priority = 1; + register_inet6addr_notifier(&sc->inet6_notifier); + + /* + * Register our netfilter hooks. + */ + result = nf_register_net_hooks(&init_net, fast_classifier_ops_post_routing, ARRAY_SIZE(fast_classifier_ops_post_routing)); + if (result < 0) { + DEBUG_ERROR("can't register nf post routing hook: %d\n", result); + goto exit3; + } + +#ifdef CONFIG_NF_CONNTRACK_EVENTS + /* + * Register a notifier hook to get fast notifications of expired connections. + */ +#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS + result = nf_conntrack_register_chain_notifier(&init_net, &fast_classifier_conntrack_notifier); +#else + result = nf_conntrack_register_notifier(&init_net, &fast_classifier_conntrack_notifier); +#endif + if (result < 0) { + DEBUG_ERROR("can't register nf notifier hook: %d\n", result); + goto exit4; + } +#endif + + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0)) + result = genl_register_family(&fast_classifier_gnl_family); + if (result) { + DEBUG_ERROR("failed to register genl family: %d\n", result); + goto exit5; + } +#elif (LINUX_VERSION_CODE >= KERNEL_VERSION(3, 13, 0)) + result = genl_register_family_with_ops_groups(&fast_classifier_gnl_family, + fast_classifier_gnl_ops, + fast_classifier_genl_mcgrp); + if (result) { + DEBUG_ERROR("failed to register genl ops: %d\n", result); + goto exit5; + } +#else + result = genl_register_family(&fast_classifier_gnl_family); + if (result) { + printk(KERN_CRIT "unable to register genl family\n"); + goto exit5; + } + + result = genl_register_ops(&fast_classifier_gnl_family, fast_classifier_gnl_ops); + if (result) { + printk(KERN_CRIT "unable to register ops\n"); + goto exit6; + } + + result = genl_register_mc_group(&fast_classifier_gnl_family, + fast_classifier_genl_mcgrp); + if (result) { + printk(KERN_CRIT "unable to register multicast group\n"); + goto exit6; + } +#endif + + printk(KERN_ALERT "fast-classifier: registered\n"); + + spin_lock_init(&sc->lock); + + /* + * Hook the receive path in the network stack. + */ +#ifdef CONFIG_SFE_ECM + rcu_read_lock(); + fast_recv = rcu_dereference(athrs_fast_nat_recv); + rcu_read_unlock(); + if (!fast_recv) { + BUG_ON(athrs_fast_nat_recv); + } +#else + BUG_ON(athrs_fast_nat_recv); +#endif + RCU_INIT_POINTER(athrs_fast_nat_recv, fast_classifier_recv); + + /* + * Hook the shortcut sync callback. 
+ */ + sfe_ipv4_register_sync_rule_callback(fast_classifier_sync_rule); + sfe_ipv6_register_sync_rule_callback(fast_classifier_sync_rule); + return 0; + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 13, 0)) +exit6: + genl_unregister_family(&fast_classifier_gnl_family); +#endif + +exit5: +#ifdef CONFIG_NF_CONNTRACK_EVENTS +#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS + nf_conntrack_unregister_chain_notifier(&init_net, &fast_classifier_conntrack_notifier); +#else + nf_conntrack_unregister_notifier(&init_net, &fast_classifier_conntrack_notifier); +#endif + +exit4: +#endif + nf_unregister_net_hooks(&init_net, fast_classifier_ops_post_routing, ARRAY_SIZE(fast_classifier_ops_post_routing)); + +exit3: + unregister_inetaddr_notifier(&sc->inet_notifier); + unregister_inet6addr_notifier(&sc->inet6_notifier); + unregister_netdevice_notifier(&sc->dev_notifier); + sysfs_remove_file(sc->sys_fast_classifier, &fast_classifier_offload_at_pkts_attr.attr); + sysfs_remove_file(sc->sys_fast_classifier, &fast_classifier_debug_info_attr.attr); + sysfs_remove_file(sc->sys_fast_classifier, &fast_classifier_skip_bridge_ingress.attr); + sysfs_remove_file(sc->sys_fast_classifier, &fast_classifier_exceptions_attr.attr); + +exit2: + kobject_put(sc->sys_fast_classifier); + +exit1: + return result; +} + +/* + * fast_classifier_exit() + */ +static void __exit fast_classifier_exit(void) +{ + struct fast_classifier *sc = &__sc; + int result = -1; + + DEBUG_INFO("SFE CM exit\n"); + printk(KERN_ALERT "fast-classifier: shutting down\n"); + + /* + * Unregister our sync callback. + */ + sfe_ipv4_register_sync_rule_callback(NULL); + sfe_ipv6_register_sync_rule_callback(NULL); + + /* + * Unregister our receive callback. + */ + RCU_INIT_POINTER(athrs_fast_nat_recv, NULL); + + /* + * Wait for all callbacks to complete. + */ + rcu_barrier(); + + /* + * Destroy all connections. + */ + sfe_ipv4_destroy_all_rules_for_dev(NULL); + sfe_ipv6_destroy_all_rules_for_dev(NULL); + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 13, 0)) + result = genl_unregister_ops(&fast_classifier_gnl_family, fast_classifier_gnl_ops); + if (result != 0) { + printk(KERN_CRIT "Unable to unreigster genl_ops\n"); + } +#endif + + result = genl_unregister_family(&fast_classifier_gnl_family); + if (result != 0) { + printk(KERN_CRIT "Unable to unregister genl_family\n"); + } + +#ifdef CONFIG_NF_CONNTRACK_EVENTS +#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS + nf_conntrack_unregister_chain_notifier(&init_net, &fast_classifier_conntrack_notifier); +#else + nf_conntrack_unregister_notifier(&init_net, &fast_classifier_conntrack_notifier); +#endif +#endif + nf_unregister_net_hooks(&init_net, fast_classifier_ops_post_routing, ARRAY_SIZE(fast_classifier_ops_post_routing)); + + unregister_inet6addr_notifier(&sc->inet6_notifier); + unregister_inetaddr_notifier(&sc->inet_notifier); + unregister_netdevice_notifier(&sc->dev_notifier); + + kobject_put(sc->sys_fast_classifier); +} + +module_init(fast_classifier_init) +module_exit(fast_classifier_exit) + +MODULE_DESCRIPTION("Shortcut Forwarding Engine - Connection Manager"); +MODULE_LICENSE("Dual BSD/GPL"); + diff --git a/shortcut-fe/fast-classifier/src/fast-classifier.h b/shortcut-fe/fast-classifier/src/fast-classifier.h new file mode 100644 index 000000000..6b7a18cf6 --- /dev/null +++ b/shortcut-fe/fast-classifier/src/fast-classifier.h @@ -0,0 +1,57 @@ +/* + * User space header to send message to the fast classifier + * + * Copyright (c) 2013,2016 The Linux Foundation. All rights reserved. 
+ * Permission to use, copy, modify, and/or distribute this software for
+ * any purpose with or without fee is hereby granted, provided that the
+ * above copyright notice and this permission notice appear in all copies.
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
+ * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <netinet/in.h>		/* struct in_addr / in6_addr */
+#include <linux/if_ether.h>	/* ETH_ALEN */
+
+#define FAST_CLASSIFIER_GENL_VERSION (1)
+#define FAST_CLASSIFIER_GENL_NAME "FC"
+#define FAST_CLASSIFIER_GENL_MCGRP "FC_MCGRP"
+#define FAST_CLASSIFIER_GENL_HDRSIZE (0)
+
+enum {
+	FAST_CLASSIFIER_A_UNSPEC,
+	FAST_CLASSIFIER_A_TUPLE,
+	__FAST_CLASSIFIER_A_MAX,
+};
+
+#define FAST_CLASSIFIER_A_MAX (__FAST_CLASSIFIER_A_MAX - 1)
+
+enum {
+	FAST_CLASSIFIER_C_UNSPEC,
+	FAST_CLASSIFIER_C_OFFLOAD,
+	FAST_CLASSIFIER_C_OFFLOADED,
+	FAST_CLASSIFIER_C_DONE,
+	__FAST_CLASSIFIER_C_MAX,
+};
+
+#define FAST_CLASSIFIER_C_MAX (__FAST_CLASSIFIER_C_MAX - 1)
+
+struct fast_classifier_tuple {
+	unsigned short ethertype;
+	unsigned char proto;
+	union {
+		struct in_addr in;
+		struct in6_addr in6;
+	} src_saddr;
+	union {
+		struct in_addr in;
+		struct in6_addr in6;
+	} dst_saddr;
+	unsigned short sport;
+	unsigned short dport;
+	unsigned char smac[ETH_ALEN];
+	unsigned char dmac[ETH_ALEN];
+};
diff --git a/shortcut-fe/fast-classifier/src/nl_classifier_test.c b/shortcut-fe/fast-classifier/src/nl_classifier_test.c
new file mode 100644
index 000000000..639417964
--- /dev/null
+++ b/shortcut-fe/fast-classifier/src/nl_classifier_test.c
@@ -0,0 +1,281 @@
+/*
+ * Copyright (c) 2016 The Linux Foundation. All rights reserved.
+ * Permission to use, copy, modify, and/or distribute this software for
+ * any purpose with or without fee is hereby granted, provided that the
+ * above copyright notice and this permission notice appear in all copies.
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
+ * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
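On the kernel side (fast_classifier_send_genl_msg() earlier in this patch) the notification buffer is sized as nlmsg_total_size() of the nla_total_size() of this tuple; the same arithmetic can be checked from userspace with libnl (sketch, link with -lnl-3; the include path for the header above is illustrative):

#include <stdio.h>
#include <netlink/attr.h>	/* nla_total_size() */
#include <netlink/msg.h>	/* nlmsg_total_size() */
#include "fast-classifier.h"	/* the header above; path is illustrative */

int main(void)
{
	int payload = nla_total_size(sizeof(struct fast_classifier_tuple));

	/* genlmsg_new() accounts for the genl header itself; only the
	 * outer netlink header must be added explicitly */
	printf("allocate %d bytes\n", nlmsg_total_size(payload));
	return 0;
}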
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <arpa/inet.h>
+#include <netlink/netlink.h>
+#include <netlink/genl/genl.h>
+#include <netlink/genl/ctrl.h>
+
+#define NL_CLASSIFIER_GENL_VERSION 1
+#define NL_CLASSIFIER_GENL_FAMILY "FC"
+#define NL_CLASSIFIER_GENL_GROUP "FC_MCGRP"
+#define NL_CLASSIFIER_GENL_HDRSIZE 0
+
+enum NL_CLASSIFIER_CMD {
+	NL_CLASSIFIER_CMD_UNSPEC,
+	NL_CLASSIFIER_CMD_ACCEL,
+	NL_CLASSIFIER_CMD_ACCEL_OK,
+	NL_CLASSIFIER_CMD_CONNECTION_CLOSED,
+	NL_CLASSIFIER_CMD_MAX,
+};
+
+enum NL_CLASSIFIER_ATTR {
+	NL_CLASSIFIER_ATTR_UNSPEC,
+	NL_CLASSIFIER_ATTR_TUPLE,
+	NL_CLASSIFIER_ATTR_MAX,
+};
+
+union nl_classifier_tuple_ip {
+	struct in_addr in;
+	struct in6_addr in6;
+};
+
+struct nl_classifier_tuple {
+	unsigned short af;
+	unsigned char proto;
+	union nl_classifier_tuple_ip src_ip;
+	union nl_classifier_tuple_ip dst_ip;
+	unsigned short sport;
+	unsigned short dport;
+	unsigned char smac[6];
+	unsigned char dmac[6];
+};
+
+struct nl_classifier_instance {
+	struct nl_sock *sock;
+	int family_id;
+	int group_id;
+	int stop;
+};
+
+struct nl_classifier_instance nl_cls_inst;
+
+static struct nla_policy nl_classifier_genl_policy[(NL_CLASSIFIER_ATTR_MAX+1)] = {
+	[NL_CLASSIFIER_ATTR_TUPLE] = { .type = NLA_UNSPEC },
+};
+
+void nl_classifier_dump_nl_tuple(struct nl_classifier_tuple *tuple)
+{
+	char ip_str[64];
+
+	printf("protocol = %s\n", (tuple->proto == IPPROTO_UDP) ? "udp" : ((tuple->proto == IPPROTO_TCP) ? "tcp" : "unknown"));
+	printf("source ip = %s\n", inet_ntop(tuple->af, &tuple->src_ip, ip_str, sizeof(ip_str)));
+	printf("destination ip = %s\n", inet_ntop(tuple->af, &tuple->dst_ip, ip_str, sizeof(ip_str)));
+	printf("source port = %d\n", ntohs(tuple->sport));
+	printf("destination port = %d\n", ntohs(tuple->dport));
+}
+
+int nl_classifier_msg_recv(struct nl_msg *msg, void *arg)
+{
+	struct nlmsghdr *nlh = nlmsg_hdr(msg);
+	struct genlmsghdr *gnlh = nlmsg_data(nlh);
+	struct nlattr *attrs[(NL_CLASSIFIER_ATTR_MAX+1)];
+
+	genlmsg_parse(nlh, NL_CLASSIFIER_GENL_HDRSIZE, attrs, NL_CLASSIFIER_ATTR_MAX, nl_classifier_genl_policy);
+
+	switch (gnlh->cmd) {
+	case NL_CLASSIFIER_CMD_ACCEL_OK:
+		printf("Acceleration successful:\n");
+		nl_classifier_dump_nl_tuple(nla_data(attrs[NL_CLASSIFIER_ATTR_TUPLE]));
+		return NL_OK;
+	case NL_CLASSIFIER_CMD_CONNECTION_CLOSED:
+		printf("Connection is closed:\n");
+		nl_classifier_dump_nl_tuple(nla_data(attrs[NL_CLASSIFIER_ATTR_TUPLE]));
+		return NL_OK;
+	default:
+		printf("nl classifier received unknown message %d\n", gnlh->cmd);
+	}
+
+	return NL_SKIP;
+}
+
+void nl_classifier_offload(struct nl_classifier_instance *inst,
+			   unsigned char proto, unsigned long *src_saddr,
+			   unsigned long *dst_saddr, unsigned short sport,
+			   unsigned short dport, int af)
+{
+	struct nl_msg *msg;
+	int ret;
+	struct nl_classifier_tuple classifier_msg;
+
+	memset(&classifier_msg, 0, sizeof(classifier_msg));
+	classifier_msg.af = af;
+	classifier_msg.proto = proto;
+	memcpy(&classifier_msg.src_ip, src_saddr, (af == AF_INET ? 4 : 16));
+	memcpy(&classifier_msg.dst_ip, dst_saddr, (af == AF_INET ?
4 : 16));
+	classifier_msg.sport = sport;
+	classifier_msg.dport = dport;
+
+	msg = nlmsg_alloc();
+	if (!msg) {
+		printf("Unable to allocate message\n");
+		return;
+	}
+
+	genlmsg_put(msg, NL_AUTO_PID, NL_AUTO_SEQ, inst->family_id,
+		    NL_CLASSIFIER_GENL_HDRSIZE, NLM_F_REQUEST,
+		    NL_CLASSIFIER_CMD_ACCEL, NL_CLASSIFIER_GENL_VERSION);
+	nla_put(msg, NL_CLASSIFIER_ATTR_TUPLE, sizeof(classifier_msg), &classifier_msg);
+
+	ret = nl_send_auto(inst->sock, msg);
+	if (ret < 0) {
+		printf("send netlink message failed.\n");
+		nlmsg_free(msg);
+		return;
+	}
+
+	nlmsg_free(msg);
+	printf("nl classifier offload connection successful\n");
+}
+
+int nl_classifier_init(struct nl_classifier_instance *inst)
+{
+	int ret;
+
+	inst->sock = nl_socket_alloc();
+	if (!inst->sock) {
+		printf("Unable to allocate socket.\n");
+		return -1;
+	}
+	genl_connect(inst->sock);
+
+	inst->family_id = genl_ctrl_resolve(inst->sock, NL_CLASSIFIER_GENL_FAMILY);
+	if (inst->family_id < 0) {
+		printf("Unable to resolve family %s\n", NL_CLASSIFIER_GENL_FAMILY);
+		goto init_failed;
+	}
+
+	inst->group_id = genl_ctrl_resolve_grp(inst->sock, NL_CLASSIFIER_GENL_FAMILY, NL_CLASSIFIER_GENL_GROUP);
+	if (inst->group_id < 0) {
+		printf("Unable to resolve mcast group %s\n", NL_CLASSIFIER_GENL_GROUP);
+		goto init_failed;
+	}
+
+	ret = nl_socket_add_membership(inst->sock, inst->group_id);
+	if (ret < 0) {
+		printf("Unable to add membership\n");
+		goto init_failed;
+	}
+
+	nl_socket_disable_seq_check(inst->sock);
+	nl_socket_modify_cb(inst->sock, NL_CB_VALID, NL_CB_CUSTOM, nl_classifier_msg_recv, NULL);
+
+	printf("nl classifier init successful\n");
+	return 0;
+
+init_failed:
+	if (inst->sock) {
+		nl_close(inst->sock);
+		nl_socket_free(inst->sock);
+		inst->sock = NULL;
+	}
+	return -1;
+}
+
+void nl_classifier_exit(struct nl_classifier_instance *inst)
+{
+	if (inst->sock) {
+		nl_close(inst->sock);
+		nl_socket_free(inst->sock);
+		inst->sock = NULL;
+	}
+	printf("nl classifier exit successful\n");
+}
+
+int nl_classifier_parse_arg(int argc, char *argv[], unsigned char *proto, unsigned long *src_saddr,
+			    unsigned long *dst_saddr, unsigned short *sport, unsigned short *dport, int *af)
+{
+	int ret;
+	unsigned short port;
+
+	if (argc < 7) {
+		printf("help: nl_classifier <v4|v6> <udp|tcp> <src_ip> <dst_ip> <sport> <dport>\n");
+		return -1;
+	}
+
+	if (0 == strncmp(argv[1], "v4", 2)) {
+		*af = AF_INET;
+	} else if (0 == strncmp(argv[1], "v6", 2)) {
+		*af = AF_INET6;
+	} else {
+		printf("Address family is not supported\n");
+		return -1;
+	}
+
+	if (0 == strncmp(argv[2], "udp", 3)) {
+		*proto = IPPROTO_UDP;
+	} else if (0 == strncmp(argv[2], "tcp", 3)) {
+		*proto = IPPROTO_TCP;
+	} else {
+		printf("Protocol is not supported\n");
+		return -1;
+	}
+
+	ret = inet_pton(*af, argv[3], src_saddr);
+	if (ret <= 0) {
+		printf("source ip has wrong format\n");
+		return -1;
+	}
+
+	ret = inet_pton(*af, argv[4], dst_saddr);
+	if (ret <= 0) {
+		printf("destination ip has wrong format\n");
+		return -1;
+	}
+
+	port = strtol(argv[5], NULL, 0);
+	*sport = htons(port);
+	port = strtol(argv[6], NULL, 0);
+	*dport = htons(port);
+
+	printf("nl classifier parse arguments successful\n");
+	return 0;
+}
+
+int main(int argc, char *argv[])
+{
+	struct nl_classifier_instance *inst = &nl_cls_inst;
+	unsigned char proto;
+	unsigned long src_addr[4];
+	unsigned long dst_addr[4];
+	unsigned short sport;
+	unsigned short dport;
+	int af;
+	int ret;
+
+	ret = nl_classifier_parse_arg(argc, argv, &proto, src_addr, dst_addr, &sport, &dport, &af);
+	if (ret < 0) {
+		printf("Failed to parse arguments\n");
+		return ret;
+	}
+
+	ret =
nl_classifier_init(inst);
+	if (ret < 0) {
+		printf("Unable to init generic netlink\n");
+		return ret;
+	}
+
+	nl_classifier_offload(inst, proto, src_addr, dst_addr, sport, dport, af);
+
+	/* main loop to listen for messages */
+	while (!inst->stop) {
+		nl_recvmsgs_default(inst->sock);
+	}
+
+	nl_classifier_exit(inst);
+
+	return 0;
+}
diff --git a/shortcut-fe/fast-classifier/src/userspace_example.c b/shortcut-fe/fast-classifier/src/userspace_example.c
new file mode 100644
index 000000000..4f4113d99
--- /dev/null
+++ b/shortcut-fe/fast-classifier/src/userspace_example.c
@@ -0,0 +1,232 @@
+/*
+ * Copyright (c) 2013,2016 The Linux Foundation. All rights reserved.
+ * Permission to use, copy, modify, and/or distribute this software for
+ * any purpose with or without fee is hereby granted, provided that the
+ * above copyright notice and this permission notice appear in all copies.
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
+ * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <arpa/inet.h>
+#include <netlink/netlink.h>
+#include <netlink/genl/genl.h>
+#include <netlink/genl/ctrl.h>
+
+#include <fast-classifier.h>
+
+static struct nl_sock *sock;
+static struct nl_sock *sock_event;
+static int family;
+static int grp_id;
+
+static struct nla_policy fast_classifier_genl_policy[FAST_CLASSIFIER_A_MAX + 1] = {
+	[FAST_CLASSIFIER_A_TUPLE] = { .type = NLA_UNSPEC },
+};
+
+void dump_fc_tuple(struct fast_classifier_tuple *fc_msg)
+{
+	char src_str[INET_ADDRSTRLEN];
+	char dst_str[INET_ADDRSTRLEN];
+
+	printf("TUPLE: %d, %s, %s, %d, %d"
+	       " SMAC=%02x:%02x:%02x:%02x:%02x:%02x"
+	       " DMAC=%02x:%02x:%02x:%02x:%02x:%02x\n",
+	       fc_msg->proto,
+	       inet_ntop(AF_INET,
+			 &fc_msg->src_saddr.in.s_addr,
+			 src_str,
+			 INET_ADDRSTRLEN),
+	       inet_ntop(AF_INET,
+			 &fc_msg->dst_saddr.in.s_addr,
+			 dst_str,
+			 INET_ADDRSTRLEN),
+	       fc_msg->sport, fc_msg->dport,
+	       fc_msg->smac[0], fc_msg->smac[1], fc_msg->smac[2],
+	       fc_msg->smac[3], fc_msg->smac[4], fc_msg->smac[5],
+	       fc_msg->dmac[0], fc_msg->dmac[1], fc_msg->dmac[2],
+	       fc_msg->dmac[3], fc_msg->dmac[4], fc_msg->dmac[5]);
+}
+
+static int parse_cb(struct nl_msg *msg, void *arg)
+{
+	struct nlmsghdr *nlh = nlmsg_hdr(msg);
+	struct genlmsghdr *gnlh = nlmsg_data(nlh);
+	struct nlattr *attrs[FAST_CLASSIFIER_A_MAX + 1];	/* genlmsg_parse() fills indices 0..maxattr */
+
+	genlmsg_parse(nlh, 0, attrs, FAST_CLASSIFIER_A_MAX, fast_classifier_genl_policy);
+
+	switch (gnlh->cmd) {
+	case FAST_CLASSIFIER_C_OFFLOADED:
+		printf("Got an offloaded message\n");
+		dump_fc_tuple(nla_data(attrs[FAST_CLASSIFIER_A_TUPLE]));
+		return NL_OK;
+	case FAST_CLASSIFIER_C_DONE:
+		printf("Got a done message\n");
+		dump_fc_tuple(nla_data(attrs[FAST_CLASSIFIER_A_TUPLE]));
+		return NL_OK;
+	}
+
+	return NL_SKIP;
+}
+
+int fast_classifier_init(void)
+{
+	int err;
+
+	sock = nl_socket_alloc();
+	if (!sock) {
+		printf("Unable to allocate socket.\n");
+		return -1;
+	}
+	genl_connect(sock);
+
+	sock_event = nl_socket_alloc();
+	if (!sock_event) {
+		nl_close(sock);
+		nl_socket_free(sock);
+		printf("Unable to allocate socket.\n");
+		return -1;
+	}
+	genl_connect(sock_event);
+
+	family = genl_ctrl_resolve(sock, FAST_CLASSIFIER_GENL_NAME);
+	if (family < 0) {
+		nl_close(sock_event);
+		nl_close(sock);
+		nl_socket_free(sock);
nl_socket_free(sock_event); + printf("Unable to resolve family\n"); + return -1; + } + + grp_id = genl_ctrl_resolve_grp(sock, FAST_CLASSIFIER_GENL_NAME, + FAST_CLASSIFIER_GENL_MCGRP); + if (grp_id < 0) { + printf("Unable to resolve mcast group\n"); + return -1; + } + + err = nl_socket_add_membership(sock_event, grp_id); + if (err < 0) { + printf("Unable to add membership\n"); + return -1; + } + + nl_socket_disable_seq_check(sock_event); + nl_socket_modify_cb(sock_event, NL_CB_VALID, NL_CB_CUSTOM, parse_cb, NULL); + + return 0; +} + +void fast_classifier_close(void) +{ + nl_close(sock_event); + nl_close(sock); + nl_socket_free(sock_event); + nl_socket_free(sock); +} + +void fast_classifier_ipv4_offload(unsigned char proto, unsigned long src_saddr, + unsigned long dst_saddr, unsigned short sport, + unsigned short dport) +{ + struct nl_msg *msg; + int ret; +#ifdef DEBUG + char src_str[INET_ADDRSTRLEN]; + char dst_str[INET_ADDRSTRLEN]; +#endif + struct fast_classifier_tuple fc_msg; + +#ifdef DEBUG + printf("DEBUG: would offload: %d, %s, %s, %d, %d\n", proto, + inet_ntop(AF_INET, &src_saddr, src_str, INET_ADDRSTRLEN), + inet_ntop(AF_INET, &dst_saddr, dst_str, INET_ADDRSTRLEN), + sport, dport); +#endif + + fc_msg.proto = proto; + fc_msg.src_saddr.in.s_addr = src_saddr; + fc_msg.dst_saddr.in.s_addr = dst_saddr; + fc_msg.sport = sport; + fc_msg.dport = dport; + fc_msg.smac[0] = 'a'; + fc_msg.smac[1] = 'b'; + fc_msg.smac[2] = 'c'; + fc_msg.smac[3] = 'd'; + fc_msg.smac[4] = 'e'; + fc_msg.smac[5] = 'f'; + fc_msg.dmac[0] = 'f'; + fc_msg.dmac[1] = 'e'; + fc_msg.dmac[2] = 'd'; + fc_msg.dmac[3] = 'c'; + fc_msg.dmac[4] = 'b'; + fc_msg.dmac[5] = 'a'; + + if (fast_classifier_init() < 0) { + printf("Unable to init generic netlink\n"); + exit(1); + } + + msg = nlmsg_alloc(); + if (!msg) { + nl_socket_free(sock); + printf("Unable to allocate message\n"); + return; + } + + genlmsg_put(msg, NL_AUTO_PID, NL_AUTO_SEQ, family, + FAST_CLASSIFIER_GENL_HDRSIZE, NLM_F_REQUEST, + FAST_CLASSIFIER_C_OFFLOAD, FAST_CLASSIFIER_GENL_VERSION); + nla_put(msg, 1, sizeof(fc_msg), &fc_msg); + + ret = nl_send_auto_complete(sock, msg); + + nlmsg_free(msg); + if (ret < 0) { + printf("nlmsg_free failed"); + nl_close(sock); + nl_socket_free(sock); + return; + } + + ret = nl_wait_for_ack(sock); + if (ret < 0) { + printf("wait for ack failed"); + nl_close(sock); + nl_socket_free(sock); + return; + } +} + +void fast_classifier_listen_for_messages(void) +{ + printf("waiting for netlink events\n"); + + while (1) { + nl_recvmsgs_default(sock_event); + } +} + +int main(int argc, char *argv[]) +{ + if (fast_classifier_init() < 0) { + printf("Unable to init generic netlink\n"); + exit(1); + } + + fast_classifier_ipv4_offload('a', 0, 0, 0, 0); + + /* this never returns */ + fast_classifier_listen_for_messages(); + + fast_classifier_close(); + + return 0; +} diff --git a/shortcut-fe/sfe.c b/shortcut-fe/sfe.c deleted file mode 100644 index 338ae96e4..000000000 --- a/shortcut-fe/sfe.c +++ /dev/null @@ -1,1838 +0,0 @@ -/* - * sfe.c - * API for shortcut forwarding engine. - * - * Copyright (c) 2015,2016, The Linux Foundation. All rights reserved. - * Copyright (c) 2021-2022 Qualcomm Innovation Center, Inc. All rights reserved. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. 
- * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "sfe_debug.h" -#include "sfe_api.h" -#include "sfe.h" -#include "sfe_pppoe.h" -#include "sfe_pppoe_mgr.h" -#include "sfe_vlan.h" -#include "sfe_ipv4.h" -#include "sfe_ipv6.h" - -extern int max_ipv4_conn; -extern int max_ipv6_conn; - -#define SFE_MESSAGE_VERSION 0x1 -#define sfe_ipv6_addr_copy(src, dest) memcpy((void *)(dest), (void *)(src), 16) -#define sfe_ipv4_stopped(CTX) (rcu_dereference((CTX)->ipv4_stats_sync_cb) == NULL) -#define sfe_ipv6_stopped(CTX) (rcu_dereference((CTX)->ipv6_stats_sync_cb) == NULL) -#define SFE_IPSEC_TUNNEL_TYPE 31 - -typedef enum sfe_exception { - SFE_EXCEPTION_IPV4_MSG_UNKNOW, - SFE_EXCEPTION_IPV6_MSG_UNKNOW, - SFE_EXCEPTION_CONNECTION_INVALID, - SFE_EXCEPTION_NOT_SUPPORT_BRIDGE, - SFE_EXCEPTION_TCP_INVALID, - SFE_EXCEPTION_PROTOCOL_NOT_SUPPORT, - SFE_EXCEPTION_SRC_DEV_NOT_L3, - SFE_EXCEPTION_DEST_DEV_NOT_L3, - SFE_EXCEPTION_CFG_ERR, - SFE_EXCEPTION_CREATE_FAILED, - SFE_EXCEPTION_ENQUEUE_FAILED, - SFE_EXCEPTION_NOT_SUPPORT_6RD, - SFE_EXCEPTION_NO_SYNC_CB, - SFE_EXCEPTION_MAX -} sfe_exception_t; - -static char *sfe_exception_events_string[SFE_EXCEPTION_MAX] = { - "IPV4_MSG_UNKNOW", - "IPV6_MSG_UNKNOW", - "CONNECTION_INVALID", - "NOT_SUPPORT_BRIDGE", - "TCP_INVALID", - "PROTOCOL_NOT_SUPPORT", - "SRC_DEV_NOT_L3", - "DEST_DEV_NOT_L3", - "CONFIG_ERROR", - "CREATE_FAILED", - "ENQUEUE_FAILED", - "NOT_SUPPORT_6RD", - "NO_SYNC_CB" -}; - -/* - * Message type of queued response message - */ -typedef enum { - SFE_MSG_TYPE_IPV4, - SFE_MSG_TYPE_IPV6 -} sfe_msg_types_t; - -/* - * Queued response message, - * will be sent back to caller in workqueue - */ -struct sfe_response_msg { - struct list_head node; - sfe_msg_types_t type; - void *msg[0]; -}; - -/* - * SFE context instance, private for SFE - */ -struct sfe_ctx_instance_internal { - struct sfe_ctx_instance base; /* Exported SFE context, is public to user of SFE*/ - - /* - * Control state. 
- */ - struct kobject *sys_sfe; /* Sysfs linkage */ - - struct list_head msg_queue; /* Response message queue*/ - spinlock_t lock; /* Lock to protect message queue */ - - struct work_struct work; /* Work to send response message back to caller*/ - - sfe_ipv4_msg_callback_t __rcu ipv4_stats_sync_cb; /* Callback to call to sync ipv4 statistics */ - sfe_ipv4_msg_callback_t __rcu ipv4_stats_sync_many_cb; /* Callback to call to sync many ipv4 statistics */ - void *ipv4_stats_sync_data; /* Argument for above callback: ipv4_stats_sync_cb */ - - sfe_ipv6_msg_callback_t __rcu ipv6_stats_sync_cb; /* Callback to call to sync ipv6 statistics */ - sfe_ipv6_msg_callback_t __rcu ipv6_stats_sync_many_cb; /* Callback to call to sync many ipv6 statistics */ - void *ipv6_stats_sync_data; /* Argument for above callback: ipv6_stats_sync_cb */ - - u32 exceptions[SFE_EXCEPTION_MAX]; /* Statistics for exception */ - - int32_t l2_feature_support; /* L2 feature support */ - -}; - -static struct sfe_ctx_instance_internal __sfe_ctx; - -/* - * Convert public SFE context to internal context - */ -#define SFE_CTX_TO_PRIVATE(base) (struct sfe_ctx_instance_internal *)(base) -/* - * Convert internal SFE context to public context - */ -#define SFE_CTX_TO_PUBLIC(intrv) (struct sfe_ctx_instance *)(intrv) - -/* - * sfe_incr_exceptions() - * Increase an exception counter. - * - * TODO: Merge sfe_ctx stats to ipv4 and ipv6 percpu stats. - */ -static inline void sfe_incr_exceptions(sfe_exception_t except) -{ - struct sfe_ctx_instance_internal *sfe_ctx = &__sfe_ctx; - - spin_lock_bh(&sfe_ctx->lock); - sfe_ctx->exceptions[except]++; - spin_unlock_bh(&sfe_ctx->lock); -} - -/* - * sfe_dev_is_layer_3_interface() - * Check if a network device is ipv4 or ipv6 layer 3 interface - * - * @param dev network device to check - * @param check_v4 check ipv4 layer 3 interface(which have ipv4 address) or ipv6 layer 3 interface(which have ipv6 address) - */ -inline bool sfe_dev_is_layer_3_interface(struct net_device *dev, bool check_v4) -{ - struct in_device *in4_dev; - struct inet6_dev *in6_dev; - - BUG_ON(!dev); - - if (likely(check_v4)) { - /* - * Does our input device support IPv4 processing? - */ - in4_dev = (struct in_device *)dev->ip_ptr; - if (unlikely(!in4_dev)) { - return false; - } - - /* - * Does it have an IPv4 address? If it doesn't then it - * could be map-t, dslite or tun6rd interface, otherwise we - * can't do anything interesting here! - */ - if (likely(in4_dev->ifa_list || (dev->priv_flags_ext & IFF_EXT_MAPT) - || (dev->rtnl_link_ops - && (!strcmp(dev->rtnl_link_ops->kind, "ip6tnl") - || !strcmp(dev->rtnl_link_ops->kind, "sit"))))) { - return true; - } - return false; - } - - /* - * Does our input device support IPv6 processing? - */ - in6_dev = (struct inet6_dev *)dev->ip6_ptr; - if (unlikely(!in6_dev)) { - return false; - } - - /* - * Does it have an IPv6 address? If it doesn't then it could be MAP-T interface, - * else we can't do anything interesting here! - */ - if (likely(!list_empty(&in6_dev->addr_list) || (dev->priv_flags_ext & IFF_EXT_MAPT))) { - return true; - } - - return false; -} - -/* - * sfe_routed_dev_allow() - * check whether routed acceleration allowed - */ -static bool sfe_routed_dev_allow(struct net_device *dev, bool is_routed, bool check_v4) -{ - if (!is_routed) { - return true; - } - - if (sfe_dev_is_layer_3_interface(dev, check_v4)) { - return true; - } - - /* - * in case of GRE / vxlan, these dev does not have IP address - * so l3 interface check will fail. 
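The TODO on sfe_incr_exceptions() above suggests merging these counters into the per-CPU IPv4/IPv6 statistics, which would remove the spinlock from the increment hot path. A hypothetical sketch of that direction (names are illustrative, not from the source):

/* Illustrative only: lock-free per-CPU exception counters. */
static DEFINE_PER_CPU(u32 [SFE_EXCEPTION_MAX], sfe_exceptions_pcpu);

static inline void sfe_incr_exceptions_pcpu(sfe_exception_t except)
{
	this_cpu_inc(sfe_exceptions_pcpu[except]);
}

/* Readers sum the per-CPU values, e.g. for the sysfs dump. */
static u32 sfe_read_exception_pcpu(sfe_exception_t except)
{
	u32 total = 0;
	int cpu;

	for_each_possible_cpu(cpu)
		total += per_cpu(sfe_exceptions_pcpu, cpu)[except];

	return total;
}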
allow rule creation between gre / vxlan - * and wan dev for routed flow. - */ - if (netif_is_vxlan(dev)) { - return true; - } - -#ifdef SFE_GRE_TUN_ENABLE - if (netif_is_gretap(dev) || netif_is_gre(dev)) { - return true; - } - - if (netif_is_ip6gre(dev) || netif_is_ip6gretap(dev)) { - return true; - } -#endif - - if (dev->type == SFE_IPSEC_TUNNEL_TYPE) { - return true; - } - - return false; -} - -/* sfe_dev_has_hw_csum() - * check whether device supports hardware checksum offload - */ -bool sfe_dev_has_hw_csum(struct net_device *dev) -{ - if (netif_is_vxlan(dev)) { - return false; - } - -#ifdef SFE_GRE_TUN_ENABLE - if (netif_is_gre(dev) || netif_is_gretap(dev)) { - return false; - } - - if (netif_is_ip6gre(dev) || netif_is_ip6gretap(dev)) { - return false; - } -#endif - /* - * Tunnel MAP-E/DS-LITE and Tun6rd share the same Routing netlink operator - * whose kind is "ip6tnl". The HW csum for these tunnel devices should be disabled. - */ - if (dev->rtnl_link_ops && !strcmp(dev->rtnl_link_ops->kind, "ip6tnl")) { - return false; - } - - if (dev->type == SFE_IPSEC_TUNNEL_TYPE) { - return false; - } - - return true; -} - -/* - * sfe_clean_response_msg_by_type() - * clean response message in queue when ECM exit - * - * @param sfe_ctx SFE context - * @param msg_type message type, ipv4 or ipv6 - */ -static void sfe_clean_response_msg_by_type(struct sfe_ctx_instance_internal *sfe_ctx, sfe_msg_types_t msg_type) -{ - struct sfe_response_msg *response, *tmp; - - if (!sfe_ctx) { - return; - } - - spin_lock_bh(&sfe_ctx->lock); - list_for_each_entry_safe(response, tmp, &sfe_ctx->msg_queue, node) { - if (response->type == msg_type) { - list_del(&response->node); - /* - * Free response message - */ - kfree(response); - } - } - spin_unlock_bh(&sfe_ctx->lock); - -} - -/* - * sfe_process_response_msg() - * Send all pending response message to ECM by calling callback function included in message - * - * @param work work structure - */ -static void sfe_process_response_msg(struct work_struct *work) -{ - struct sfe_ctx_instance_internal *sfe_ctx = container_of(work, struct sfe_ctx_instance_internal, work); - struct sfe_response_msg *response; - - spin_lock_bh(&sfe_ctx->lock); - while ((response = list_first_entry_or_null(&sfe_ctx->msg_queue, struct sfe_response_msg, node))) { - list_del(&response->node); - spin_unlock_bh(&sfe_ctx->lock); - rcu_read_lock(); - - /* - * Send response message back to caller - */ - if ((response->type == SFE_MSG_TYPE_IPV4) && !sfe_ipv4_stopped(sfe_ctx)) { - struct sfe_ipv4_msg *msg = (struct sfe_ipv4_msg *)response->msg; - sfe_ipv4_msg_callback_t callback = (sfe_ipv4_msg_callback_t)msg->cm.cb; - if (callback) { - callback((void *)msg->cm.app_data, msg); - } - } else if ((response->type == SFE_MSG_TYPE_IPV6) && !sfe_ipv6_stopped(sfe_ctx)) { - struct sfe_ipv6_msg *msg = (struct sfe_ipv6_msg *)response->msg; - sfe_ipv6_msg_callback_t callback = (sfe_ipv6_msg_callback_t)msg->cm.cb; - if (callback) { - callback((void *)msg->cm.app_data, msg); - } - } - - rcu_read_unlock(); - /* - * Free response message - */ - kfree(response); - spin_lock_bh(&sfe_ctx->lock); - } - spin_unlock_bh(&sfe_ctx->lock); -} - -/* - * sfe_alloc_response_msg() - * Alloc and construct new response message - * - * @param type message type - * @param msg used to construct response message if not NULL - * - * @return !NULL, success; NULL, failed - */ -static struct sfe_response_msg * -sfe_alloc_response_msg(sfe_msg_types_t type, void *msg) -{ - struct sfe_response_msg *response; - int size; - - switch (type) { - case 
SFE_MSG_TYPE_IPV4: - size = sizeof(struct sfe_ipv4_msg); - break; - case SFE_MSG_TYPE_IPV6: - size = sizeof(struct sfe_ipv6_msg); - break; - default: - DEBUG_ERROR("message type %d not supported\n", type); - return NULL; - } - - response = (struct sfe_response_msg *)kzalloc(sizeof(struct sfe_response_msg) + size, GFP_ATOMIC); - if (!response) { - DEBUG_ERROR("allocate memory failed\n"); - return NULL; - } - - response->type = type; - - if (msg) { - memcpy(response->msg, msg, size); - } - - return response; -} - -/* - * sfe_fast_xmit_check() - * Check the fast transmit feasibility. - * - * This check the per direction's attribute that could not go fast - * transmit - * xfrm packets, come from a local socket or need sk validation on the skb - */ -bool sfe_fast_xmit_check(struct sk_buff *skb, netdev_features_t features) -{ - -#ifdef CONFIG_SOCK_VALIDATE_XMIT - if (skb->sk && sk_fullsock(skb->sk) && skb->sk->sk_validate_xmit_skb) { - DEBUG_INFO("%px:need sk validation\n", skb); - return false; -#ifdef CONFIG_TLS_DEVICE - } else if (skb->decrypted) { - DEBUG_INFO("%px:SK or decrypted\n", skb); - return false; -#endif - } -#endif - if (skb_vlan_tag_present(skb)) { - DEBUG_INFO("%px:Vlan is present\n", skb); - return false; - } - - if (netif_needs_gso(skb, features)) { - DEBUG_INFO("%px:Need to be gso\n", skb); - return false; - } - - if (skb_sec_path(skb)) { - DEBUG_INFO("%px:XFRM is present\n", skb); - return false; - } - - return true; -} - -/* - * sfe_enqueue_msg() - * Queue response message - * - * @param sfe_ctx SFE context - * @param response response message to be queue - */ -static inline void sfe_enqueue_msg(struct sfe_ctx_instance_internal *sfe_ctx, struct sfe_response_msg *response) -{ - spin_lock_bh(&sfe_ctx->lock); - list_add_tail(&response->node, &sfe_ctx->msg_queue); - spin_unlock_bh(&sfe_ctx->lock); - - schedule_work(&sfe_ctx->work); -} - -/* - * sfe_cmn_msg_init() - * Initialize the common message structure. - * - * @param ncm message to init - * @param if_num interface number related with this message - * @param type message type - * @param cb callback function to process repsonse of this message - * @param app_data argument for above callback function - */ -static void sfe_cmn_msg_init(struct sfe_cmn_msg *ncm, u16 if_num, u32 type, u32 len, void *cb, void *app_data) -{ - ncm->interface = if_num; - ncm->version = SFE_MESSAGE_VERSION; - ncm->type = type; - ncm->len = len; - ncm->cb = (sfe_ptr_t)cb; - ncm->app_data = (sfe_ptr_t)app_data; -} - -/* - * sfe_ipv4_stats_many_sync_callback() - * Synchronize many connection's state. - * - * @param SFE statistics from SFE core engine - */ -static void sfe_ipv4_stats_many_sync_callback(struct sfe_ipv4_msg *msg) -{ - struct sfe_ctx_instance_internal *sfe_ctx = &__sfe_ctx; - sfe_ipv4_msg_callback_t sync_cb; - - rcu_read_lock(); - sync_cb = rcu_dereference(sfe_ctx->ipv4_stats_sync_many_cb); - rcu_read_unlock(); - if (!sync_cb) { - sfe_incr_exceptions(SFE_EXCEPTION_NO_SYNC_CB); - return; - } - sync_cb(sfe_ctx->ipv4_stats_sync_data, msg); -} - -/* - * sfe_ipv4_stats_convert() - * Convert the internal message format to ecm format. 
- * - * @param sync_msg stat msg to ecm - * @param sis SFE statistics from SFE core engine - */ -void sfe_ipv4_stats_convert(struct sfe_ipv4_conn_sync *sync_msg, struct sfe_connection_sync *sis) -{ - /* - * Fill connection specific information - */ - sync_msg->protocol = (u8)sis->protocol; - sync_msg->flow_ip = sis->src_ip.ip; - sync_msg->flow_ip_xlate = sis->src_ip_xlate.ip; - sync_msg->flow_ident = sis->src_port; - sync_msg->flow_ident_xlate = sis->src_port_xlate; - - sync_msg->return_ip = sis->dest_ip.ip; - sync_msg->return_ip_xlate = sis->dest_ip_xlate.ip; - sync_msg->return_ident = sis->dest_port; - sync_msg->return_ident_xlate = sis->dest_port_xlate; - - /* - * Fill TCP protocol specific information - */ - if (sis->protocol == IPPROTO_TCP) { - sync_msg->flow_max_window = sis->src_td_max_window; - sync_msg->flow_end = sis->src_td_end; - sync_msg->flow_max_end = sis->src_td_max_end; - - sync_msg->return_max_window = sis->dest_td_max_window; - sync_msg->return_end = sis->dest_td_end; - sync_msg->return_max_end = sis->dest_td_max_end; - } - - /* - * Fill statistics information - */ - sync_msg->flow_rx_packet_count = sis->src_new_packet_count; - sync_msg->flow_rx_byte_count = sis->src_new_byte_count; - sync_msg->flow_tx_packet_count = sis->dest_new_packet_count; - sync_msg->flow_tx_byte_count = sis->dest_new_byte_count; - - sync_msg->return_rx_packet_count = sis->dest_new_packet_count; - sync_msg->return_rx_byte_count = sis->dest_new_byte_count; - sync_msg->return_tx_packet_count = sis->src_new_packet_count; - sync_msg->return_tx_byte_count = sis->src_new_byte_count; - - /* - * Fill expiration time to extend, in unit of msec - */ - sync_msg->inc_ticks = (((u32)sis->delta_jiffies) * MSEC_PER_SEC)/HZ; - - /* - * Fill other information - */ - switch (sis->reason) { - case SFE_SYNC_REASON_DESTROY: - sync_msg->reason = SFE_RULE_SYNC_REASON_DESTROY; - break; - case SFE_SYNC_REASON_FLUSH: - sync_msg->reason = SFE_RULE_SYNC_REASON_FLUSH; - break; - default: - sync_msg->reason = SFE_RULE_SYNC_REASON_STATS; - break; - } - return; -} - -/* - * sfe_ipv4_stats_one_sync_callback() - * Synchronize a connection's state. - * - * @param sis SFE statistics from SFE core engine - */ -static void sfe_ipv4_stats_one_sync_callback(struct sfe_connection_sync *sis) -{ - struct sfe_ctx_instance_internal *sfe_ctx = &__sfe_ctx; - struct sfe_ipv4_msg msg; - struct sfe_ipv4_conn_sync *sync_msg; - sfe_ipv4_msg_callback_t sync_cb; - - rcu_read_lock(); - sync_cb = rcu_dereference(sfe_ctx->ipv4_stats_sync_cb); - rcu_read_unlock(); - if (!sync_cb) { - sfe_incr_exceptions(SFE_EXCEPTION_NO_SYNC_CB); - return; - } - - sync_msg = &msg.msg.conn_stats; - - memset(&msg, 0, sizeof(msg)); - sfe_cmn_msg_init(&msg.cm, 0, SFE_RX_CONN_STATS_SYNC_MSG, - sizeof(struct sfe_ipv4_conn_sync), NULL, NULL); - - sfe_ipv4_stats_convert(sync_msg, sis); - - /* - * SFE sync calling is excuted in a timer, so we can redirect it to ECM directly. - */ - sync_cb(sfe_ctx->ipv4_stats_sync_data, &msg); -} - -/* - * sfe_recv_parse_l2() - * Parse L2 headers - * - * Returns true if the packet is parsed and false otherwise. 
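The inc_ticks expression above is the open-coded form of the kernel helper jiffies_to_msecs(); assuming delta_jiffies fits in an unsigned long, the line could equivalently be written as:

	/* (delta_jiffies * MSEC_PER_SEC) / HZ, with rounding handled by the helper */
	sync_msg->inc_ticks = jiffies_to_msecs((unsigned long)sis->delta_jiffies);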
- */ -static bool sfe_recv_parse_l2(struct net_device *dev, struct sk_buff *skb, struct sfe_l2_info *l2_info) -{ - /* - * VLAN parsing - */ - if (unlikely(!sfe_vlan_check_and_parse_tag(skb, l2_info))) { - return false; - } - - /* - * Parse only PPPoE session packets - */ - if (htons(ETH_P_PPP_SES) == skb->protocol) { - if (!sfe_pppoe_parse_hdr(skb, l2_info)) { - - /* - * For exception from PPPoE return from here without modifying the skb->data - * This includes non-IPv4/v6 cases also - */ - return false; - } - } - return true; -} - -/* - * sfe_recv_undo_parse_l2() - */ -static void sfe_recv_undo_parse_l2(struct net_device *dev, struct sk_buff *skb, struct sfe_l2_info *l2_info) -{ - /* - * PPPoE undo - */ - sfe_pppoe_undo_parse(skb, l2_info); - - /* - * VLAN undo - */ - sfe_vlan_undo_parse(skb, l2_info); - - /* - * packet is not handled by SFE, so reset the network header - */ - skb_reset_network_header(skb); -} - -/* - * sfe_create_ipv4_rule_msg() - * Convert create message format from ecm to sfe - * - * @param sfe_ctx SFE context - * @param msg The IPv4 message - * - * @return sfe_tx_status_t The status of the Tx operation - */ -sfe_tx_status_t sfe_create_ipv4_rule_msg(struct sfe_ctx_instance_internal *sfe_ctx, struct sfe_ipv4_msg *msg) -{ - struct net_device *src_dev = NULL; - struct net_device *dest_dev = NULL; - struct sfe_response_msg *response; - enum sfe_cmn_response ret = SFE_TX_SUCCESS; - bool is_routed = true; - bool cfg_err; - - response = sfe_alloc_response_msg(SFE_MSG_TYPE_IPV4, msg); - if (!response) { - sfe_incr_exceptions(SFE_EXCEPTION_ENQUEUE_FAILED); - return SFE_TX_FAILURE_QUEUE; - } - - if (!(msg->msg.rule_create.valid_flags & SFE_RULE_CREATE_CONN_VALID)) { - ret = SFE_CMN_RESPONSE_EMSG; - sfe_incr_exceptions(SFE_EXCEPTION_CONNECTION_INVALID); - goto failed_ret; - } - - switch (msg->msg.rule_create.tuple.protocol) { - case IPPROTO_TCP: - if (!(msg->msg.rule_create.valid_flags & SFE_RULE_CREATE_TCP_VALID)) { - ret = SFE_CMN_RESPONSE_EMSG; - sfe_incr_exceptions(SFE_EXCEPTION_TCP_INVALID); - goto failed_ret; - } - break; - - case IPPROTO_UDP: - break; - - case IPPROTO_GRE: - break; - - case IPPROTO_IPV6: - break; - - case IPPROTO_ESP: - break; - - case IPPROTO_RAW: - /* - * for accelerating PPPoE bridged flows using 3-tuple information - */ - break; - - default: - ret = SFE_CMN_RESPONSE_EMSG; - sfe_incr_exceptions(SFE_EXCEPTION_PROTOCOL_NOT_SUPPORT); - goto failed_ret; - } - - /* - * Bridge flows are accelerated if L2 feature is enabled. - */ - if (msg->msg.rule_create.rule_flags & SFE_RULE_CREATE_FLAG_BRIDGE_FLOW) { - if (!sfe_is_l2_feature_enabled()) { - ret = SFE_CMN_RESPONSE_EINTERFACE; - sfe_incr_exceptions(SFE_EXCEPTION_NOT_SUPPORT_BRIDGE); - goto failed_ret; - } - - is_routed = false; - } - - /* - * Does our input device support IP processing? - */ - src_dev = dev_get_by_index(&init_net, msg->msg.rule_create.conn_rule.flow_top_interface_num); - if (!src_dev || !sfe_routed_dev_allow(src_dev, is_routed, true)) { - ret = SFE_CMN_RESPONSE_EINTERFACE; - sfe_incr_exceptions(SFE_EXCEPTION_SRC_DEV_NOT_L3); - goto failed_ret; - } - - /* - * Check whether L2 feature is disabled and rule flag is configured to use bottom interface - */ - cfg_err = (msg->msg.rule_create.rule_flags & SFE_RULE_CREATE_FLAG_USE_FLOW_BOTTOM_INTERFACE) && !sfe_is_l2_feature_enabled(); - if (cfg_err) { - ret = SFE_CMN_RESPONSE_EMSG; - sfe_incr_exceptions(SFE_EXCEPTION_CFG_ERR); - goto failed_ret; - } - - /* - * Does our output device support IP processing? 
- */ - dest_dev = dev_get_by_index(&init_net, msg->msg.rule_create.conn_rule.return_top_interface_num); - if (!dest_dev || !sfe_routed_dev_allow(dest_dev, is_routed, true)) { - ret = SFE_CMN_RESPONSE_EINTERFACE; - sfe_incr_exceptions(SFE_EXCEPTION_DEST_DEV_NOT_L3); - goto failed_ret; - } - - /* - * Check whether L2 feature is disabled and rule flag is configured to use bottom interface - */ - cfg_err = (msg->msg.rule_create.rule_flags & SFE_RULE_CREATE_FLAG_USE_RETURN_BOTTOM_INTERFACE) && !sfe_is_l2_feature_enabled(); - if (cfg_err) { - ret = SFE_CMN_RESPONSE_EMSG; - sfe_incr_exceptions(SFE_EXCEPTION_CFG_ERR); - goto failed_ret; - } - - if (!sfe_ipv4_create_rule(&msg->msg.rule_create)) { - /* success */ - ret = SFE_CMN_RESPONSE_ACK; - } else { - /* Failed */ - ret = SFE_CMN_RESPONSE_EMSG; - sfe_incr_exceptions(SFE_EXCEPTION_CREATE_FAILED); - } - - /* - * Fall through - */ -failed_ret: - if (src_dev) { - dev_put(src_dev); - } - - if (dest_dev) { - dev_put(dest_dev); - } - - /* - * Try to queue response message - */ - ((struct sfe_ipv4_msg *)response->msg)->cm.response = msg->cm.response = ret; - sfe_enqueue_msg(sfe_ctx, response); - - return SFE_TX_SUCCESS; -} - -/* - * sfe_destroy_ipv4_rule_msg() - * Convert destroy message format from ecm to sfe - * - * @param sfe_ctx SFE context - * @param msg The IPv4 message - * - * @return sfe_tx_status_t The status of the Tx operation - */ -sfe_tx_status_t sfe_destroy_ipv4_rule_msg(struct sfe_ctx_instance_internal *sfe_ctx, struct sfe_ipv4_msg *msg) -{ - struct sfe_response_msg *response; - - response = sfe_alloc_response_msg(SFE_MSG_TYPE_IPV4, msg); - if (!response) { - sfe_incr_exceptions(SFE_EXCEPTION_ENQUEUE_FAILED); - return SFE_TX_FAILURE_QUEUE; - } - - sfe_ipv4_destroy_rule(&msg->msg.rule_destroy); - - /* - * Try to queue response message - */ - ((struct sfe_ipv4_msg *)response->msg)->cm.response = msg->cm.response = SFE_CMN_RESPONSE_ACK; - sfe_enqueue_msg(sfe_ctx, response); - - return SFE_TX_SUCCESS; -} - -/* - * sfe_sync_ipv4_stats_many_msg() - * sync con stats msg from the ecm - * - * @param sfe_ctx SFE context - * @param msg The IPv4 message - * - * @return sfe_tx_status_t The status of the Tx operation - */ -sfe_tx_status_t sfe_sync_ipv4_stats_many_msg(struct sfe_ctx_instance_internal *sfe_ctx, struct sfe_ipv4_msg *msg) -{ - struct sfe_ipv4_conn_sync_many_msg *nicsm; - nicsm = &(msg->msg.conn_stats_many); - - if (sfe_ipv4_sync_invoke(nicsm->index)) { - return SFE_TX_SUCCESS; - } - return SFE_TX_FAILURE; -} - -/* - * sfe_ipv4_tx() - * Transmit an IPv4 message to the sfe - * - * @param sfe_ctx SFE context - * @param msg The IPv4 message - * - * @return sfe_tx_status_t The status of the Tx operation - */ -sfe_tx_status_t sfe_ipv4_tx(struct sfe_ctx_instance *sfe_ctx, struct sfe_ipv4_msg *msg) -{ - switch (msg->cm.type) { - case SFE_TX_CREATE_RULE_MSG: - return sfe_create_ipv4_rule_msg(SFE_CTX_TO_PRIVATE(sfe_ctx), msg); - case SFE_TX_DESTROY_RULE_MSG: - return sfe_destroy_ipv4_rule_msg(SFE_CTX_TO_PRIVATE(sfe_ctx), msg); - case SFE_TX_CONN_STATS_SYNC_MANY_MSG: - return sfe_sync_ipv4_stats_many_msg(SFE_CTX_TO_PRIVATE(sfe_ctx),msg); - default: - sfe_incr_exceptions(SFE_EXCEPTION_IPV4_MSG_UNKNOW); - return SFE_TX_FAILURE_NOT_ENABLED; - } -} -EXPORT_SYMBOL(sfe_ipv4_tx); - -/* - * sfe_ipv4_msg_init() - * Initialize IPv4 message. 
- */ -void sfe_ipv4_msg_init(struct sfe_ipv4_msg *nim, u16 if_num, u32 type, u32 len, - sfe_ipv4_msg_callback_t cb, void *app_data) -{ - sfe_cmn_msg_init(&nim->cm, if_num, type, len, (void *)cb, app_data); -} -EXPORT_SYMBOL(sfe_ipv4_msg_init); - -/* - * sfe_ipv4_max_conn_count() - * Return maximum number of entries SFE supported - */ -int sfe_ipv4_max_conn_count(void) -{ - return max_ipv4_conn; -} -EXPORT_SYMBOL(sfe_ipv4_max_conn_count); - -/* - * sfe_ipv4_notify_register() - * Register a notifier callback for IPv4 messages from SFE - * - * @param cb The callback pointer - * @param app_data The application context for this message - * - * @return struct sfe_ctx_instance * The SFE context - */ -struct sfe_ctx_instance *sfe_ipv4_notify_register(sfe_ipv4_msg_callback_t one_rule_cb, - sfe_ipv4_msg_callback_t many_rules_cb,void *app_data) -{ - struct sfe_ctx_instance_internal *sfe_ctx = &__sfe_ctx; - - spin_lock_bh(&sfe_ctx->lock); - /* - * Hook the shortcut sync callback. - */ - if (one_rule_cb && !sfe_ctx->ipv4_stats_sync_cb) { - sfe_ipv4_register_sync_rule_callback(sfe_ipv4_stats_one_sync_callback); - } - rcu_assign_pointer(sfe_ctx->ipv4_stats_sync_cb, one_rule_cb); - - if (many_rules_cb && !sfe_ctx->ipv4_stats_sync_many_cb) { - sfe_ipv4_register_many_sync_callback(sfe_ipv4_stats_many_sync_callback); - } - rcu_assign_pointer(sfe_ctx->ipv4_stats_sync_many_cb, many_rules_cb); - - sfe_ctx->ipv4_stats_sync_data = app_data; - - spin_unlock_bh(&sfe_ctx->lock); - - return SFE_CTX_TO_PUBLIC(sfe_ctx); -} -EXPORT_SYMBOL(sfe_ipv4_notify_register); - -/* - * sfe_ipv4_notify_unregister() - * Un-Register the notifier callback for IPv4 messages from SFE - */ -void sfe_ipv4_notify_unregister(void) -{ - struct sfe_ctx_instance_internal *sfe_ctx = &__sfe_ctx; - - spin_lock_bh(&sfe_ctx->lock); - - /* - * Unregister our single rule msg sync callback. - */ - if (sfe_ctx->ipv4_stats_sync_cb) { - sfe_ipv4_register_sync_rule_callback(NULL); - rcu_assign_pointer(sfe_ctx->ipv4_stats_sync_cb, NULL); - } - - /* - * Unregister our many rule msg sync callback. - */ - if (sfe_ctx->ipv4_stats_sync_many_cb) { - sfe_ipv4_register_many_sync_callback(NULL); - rcu_assign_pointer(sfe_ctx->ipv4_stats_sync_many_cb, NULL); - } - - sfe_ctx->ipv4_stats_sync_data = NULL; - - spin_unlock_bh(&sfe_ctx->lock); - - sfe_clean_response_msg_by_type(sfe_ctx, SFE_MSG_TYPE_IPV4); - return; -} -EXPORT_SYMBOL(sfe_ipv4_notify_unregister); - -/* - * sfe_ipv6_many_stats_sync_callback() - * Synchronize many connection's state. - */ -static void sfe_ipv6_many_stats_sync_callback(struct sfe_ipv6_msg *msg) -{ - struct sfe_ctx_instance_internal *sfe_ctx = &__sfe_ctx; - sfe_ipv6_msg_callback_t sync_cb; - - rcu_read_lock(); - sync_cb = rcu_dereference(sfe_ctx->ipv6_stats_sync_many_cb); - rcu_read_unlock(); - if (!sync_cb) { - sfe_incr_exceptions(SFE_EXCEPTION_NO_SYNC_CB); - return; - } - - sync_cb(sfe_ctx->ipv6_stats_sync_data, msg); -} - -/* - * sfe_ipv6_stats_convert() - * Convert the internal message format to ecm format. 
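Taken together, sfe_ipv4_msg_init(), sfe_ipv4_tx() and sfe_ipv4_notify_register() define the caller-side contract. A hedged sketch of how a connection manager might issue a create-rule request; the callback and field assignments are illustrative, not taken from ECM:

/* The response arrives asynchronously via the queued-work path above. */
static void example_response_cb(void *app_data, struct sfe_ipv4_msg *nim)
{
	if (nim->cm.response != SFE_CMN_RESPONSE_ACK)
		DEBUG_WARN("create rule rejected: %d\n", nim->cm.response);
}

static sfe_tx_status_t example_offload(struct sfe_ctx_instance *ctx,
				       struct sfe_ipv4_msg *nim)
{
	sfe_ipv4_msg_init(nim, 0, SFE_TX_CREATE_RULE_MSG,
			  sizeof(struct sfe_ipv4_rule_create_msg),
			  example_response_cb, NULL);
	nim->msg.rule_create.valid_flags |= SFE_RULE_CREATE_CONN_VALID;
	/* ...fill tuple, conn_rule and remaining flag fields before sending... */
	return sfe_ipv4_tx(ctx, nim);
}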
- * - * @param sync_msg stat msg to ecm - * @param sis SFE statistics from SFE core engine - */ -void sfe_ipv6_stats_convert(struct sfe_ipv6_conn_sync *sync_msg, struct sfe_connection_sync *sis) -{ - /* - * Fill connection specific information - */ - sync_msg->protocol = (u8)sis->protocol; - sfe_ipv6_addr_copy(sis->src_ip.ip6, sync_msg->flow_ip); - sync_msg->flow_ident = sis->src_port; - - sfe_ipv6_addr_copy(sis->dest_ip.ip6, sync_msg->return_ip); - sync_msg->return_ident = sis->dest_port; - - /* - * Fill TCP protocol specific information - */ - if (sis->protocol == IPPROTO_TCP) { - sync_msg->flow_max_window = sis->src_td_max_window; - sync_msg->flow_end = sis->src_td_end; - sync_msg->flow_max_end = sis->src_td_max_end; - - sync_msg->return_max_window = sis->dest_td_max_window; - sync_msg->return_end = sis->dest_td_end; - sync_msg->return_max_end = sis->dest_td_max_end; - } - - /* - * Fill statistics information - */ - sync_msg->flow_rx_packet_count = sis->src_new_packet_count; - sync_msg->flow_rx_byte_count = sis->src_new_byte_count; - sync_msg->flow_tx_packet_count = sis->dest_new_packet_count; - sync_msg->flow_tx_byte_count = sis->dest_new_byte_count; - - sync_msg->return_rx_packet_count = sis->dest_new_packet_count; - sync_msg->return_rx_byte_count = sis->dest_new_byte_count; - sync_msg->return_tx_packet_count = sis->src_new_packet_count; - sync_msg->return_tx_byte_count = sis->src_new_byte_count; - - /* - * Fill expiration time to extend, in unit of msec - */ - sync_msg->inc_ticks = (((u32)sis->delta_jiffies) * MSEC_PER_SEC)/HZ; - - /* - * Fill other information - */ - switch (sis->reason) { - case SFE_SYNC_REASON_DESTROY: - sync_msg->reason = SFE_RULE_SYNC_REASON_DESTROY; - break; - case SFE_SYNC_REASON_FLUSH: - sync_msg->reason = SFE_RULE_SYNC_REASON_FLUSH; - break; - default: - sync_msg->reason = SFE_RULE_SYNC_REASON_STATS; - break; - } - - return; -} - -/* - * sfe_ipv6_stats_sync_callback() - * Synchronize a connection's state. - */ -static void sfe_ipv6_stats_sync_callback(struct sfe_connection_sync *sis) -{ - struct sfe_ctx_instance_internal *sfe_ctx = &__sfe_ctx; - struct sfe_ipv6_msg msg; - struct sfe_ipv6_conn_sync *sync_msg; - sfe_ipv6_msg_callback_t sync_cb; - - rcu_read_lock(); - sync_cb = rcu_dereference(sfe_ctx->ipv6_stats_sync_cb); - rcu_read_unlock(); - if (!sync_cb) { - sfe_incr_exceptions(SFE_EXCEPTION_NO_SYNC_CB); - return; - } - - sync_msg = &msg.msg.conn_stats; - - memset(&msg, 0, sizeof(msg)); - sfe_cmn_msg_init(&msg.cm, 0, SFE_RX_CONN_STATS_SYNC_MSG, - sizeof(struct sfe_ipv6_conn_sync), NULL, NULL); - - sfe_ipv6_stats_convert(sync_msg, sis); - - /* - * SFE sync calling is excuted in a timer, so we can redirect it to ECM directly. 
- */ - sync_cb(sfe_ctx->ipv6_stats_sync_data, &msg); -} - -/* - * sfe_create_ipv6_rule_msg() - * convert create message format from ecm to sfe - * - * @param sfe_ctx SFE context - * @param msg The IPv6 message - * - * @return sfe_tx_status_t The status of the Tx operation - */ -sfe_tx_status_t sfe_create_ipv6_rule_msg(struct sfe_ctx_instance_internal *sfe_ctx, struct sfe_ipv6_msg *msg) -{ - struct net_device *src_dev = NULL; - struct net_device *dest_dev = NULL; - struct sfe_response_msg *response; - enum sfe_cmn_response ret = SFE_TX_SUCCESS; - bool is_routed = true; - bool cfg_err; - - response = sfe_alloc_response_msg(SFE_MSG_TYPE_IPV6, msg); - if (!response) { - sfe_incr_exceptions(SFE_EXCEPTION_ENQUEUE_FAILED); - return SFE_TX_FAILURE_QUEUE; - } - - if (!(msg->msg.rule_create.valid_flags & SFE_RULE_CREATE_CONN_VALID)) { - ret = SFE_CMN_RESPONSE_EMSG; - sfe_incr_exceptions(SFE_EXCEPTION_CONNECTION_INVALID); - goto failed_ret; - } - - /* - * Bridge flows are accelerated if L2 feature is enabled. - */ - if (msg->msg.rule_create.rule_flags & SFE_RULE_CREATE_FLAG_BRIDGE_FLOW) { - if (!sfe_is_l2_feature_enabled()) { - ret = SFE_CMN_RESPONSE_EINTERFACE; - sfe_incr_exceptions(SFE_EXCEPTION_NOT_SUPPORT_BRIDGE); - goto failed_ret; - } - is_routed = false; - } - - switch(msg->msg.rule_create.tuple.protocol) { - - case IPPROTO_TCP: - if (!(msg->msg.rule_create.valid_flags & SFE_RULE_CREATE_TCP_VALID)) { - ret = SFE_CMN_RESPONSE_EMSG; - sfe_incr_exceptions(SFE_EXCEPTION_TCP_INVALID); - goto failed_ret; - } - - break; - - case IPPROTO_UDP: - break; - - case IPPROTO_IPIP: - break; - - case IPPROTO_GRE: - break; - - case IPPROTO_ESP: - break; - - case IPPROTO_RAW: - /* - * for accelerating PPPoE bridged flows using 3-tuple information - */ - break; - - default: - ret = SFE_CMN_RESPONSE_EMSG; - sfe_incr_exceptions(SFE_EXCEPTION_PROTOCOL_NOT_SUPPORT); - goto failed_ret; - } - - /* - * Does our input device support IP processing? - */ - src_dev = dev_get_by_index(&init_net, msg->msg.rule_create.conn_rule.flow_top_interface_num); - if (!src_dev || !sfe_routed_dev_allow(src_dev, is_routed, false)) { - ret = SFE_CMN_RESPONSE_EINTERFACE; - sfe_incr_exceptions(SFE_EXCEPTION_SRC_DEV_NOT_L3); - goto failed_ret; - } - - /* - * Check whether L2 feature is disabled and rule flag is configured to use bottom interface - */ - cfg_err = (msg->msg.rule_create.rule_flags & SFE_RULE_CREATE_FLAG_USE_FLOW_BOTTOM_INTERFACE) && !sfe_is_l2_feature_enabled(); - if (cfg_err) { - ret = SFE_CMN_RESPONSE_EMSG; - sfe_incr_exceptions(SFE_EXCEPTION_CFG_ERR); - goto failed_ret; - } - - /* - * Does our output device support IP processing? 
- */ - dest_dev = dev_get_by_index(&init_net, msg->msg.rule_create.conn_rule.return_top_interface_num); - if (!dest_dev || !sfe_routed_dev_allow(dest_dev, is_routed, false)) { - ret = SFE_CMN_RESPONSE_EINTERFACE; - sfe_incr_exceptions(SFE_EXCEPTION_DEST_DEV_NOT_L3); - goto failed_ret; - } - - /* - * Check whether L2 feature is disabled and rule flag is configured to use bottom interface - */ - cfg_err = (msg->msg.rule_create.rule_flags & SFE_RULE_CREATE_FLAG_USE_RETURN_BOTTOM_INTERFACE) && !sfe_is_l2_feature_enabled(); - if (cfg_err) { - ret = SFE_CMN_RESPONSE_EMSG; - sfe_incr_exceptions(SFE_EXCEPTION_CFG_ERR); - goto failed_ret; - } - - if (!sfe_ipv6_create_rule(&msg->msg.rule_create)) { - /* success */ - ret = SFE_CMN_RESPONSE_ACK; - } else { - /* Failed */ - ret = SFE_CMN_RESPONSE_EMSG; - sfe_incr_exceptions(SFE_EXCEPTION_CREATE_FAILED); - } - - /* - * Fall through - */ -failed_ret: - if (src_dev) { - dev_put(src_dev); - } - - if (dest_dev) { - dev_put(dest_dev); - } - - /* - * Try to queue response message - */ - ((struct sfe_ipv6_msg *)response->msg)->cm.response = msg->cm.response = ret; - sfe_enqueue_msg(sfe_ctx, response); - - return SFE_TX_SUCCESS; -} - -/* - * sfe_destroy_ipv6_rule_msg() - * Convert destroy message format from ecm to sfe - * - * @param sfe_ctx SFE context - * @param msg The IPv6 message - * - * @return sfe_tx_status_t The status of the Tx operation - */ -sfe_tx_status_t sfe_destroy_ipv6_rule_msg(struct sfe_ctx_instance_internal *sfe_ctx, struct sfe_ipv6_msg *msg) -{ - struct sfe_response_msg *response; - - response = sfe_alloc_response_msg(SFE_MSG_TYPE_IPV6, msg); - if (!response) { - sfe_incr_exceptions(SFE_EXCEPTION_ENQUEUE_FAILED); - return SFE_TX_FAILURE_QUEUE; - } - - sfe_ipv6_destroy_rule(&msg->msg.rule_destroy); - - /* - * Try to queue response message - */ - ((struct sfe_ipv6_msg *)response->msg)->cm.response = msg->cm.response = SFE_CMN_RESPONSE_ACK; - sfe_enqueue_msg(sfe_ctx, response); - - return SFE_TX_SUCCESS; -} - -/* - * sfe_sync_ipv6_stats_many_msg() - * sync con stats msg from the ecm - * - * @param sfe_ctx SFE context - * @param msg The IPv6 message - * - * @return sfe_tx_status_t The status of the Tx operation - */ -sfe_tx_status_t sfe_sync_ipv6_stats_many_msg(struct sfe_ctx_instance_internal *sfe_ctx, struct sfe_ipv6_msg *msg) -{ - struct sfe_ipv6_conn_sync_many_msg *nicsm; - nicsm = &(msg->msg.conn_stats_many); - - if (sfe_ipv6_sync_invoke(nicsm->index)) { - return SFE_TX_SUCCESS; - } - return SFE_TX_FAILURE; -} - -/* - * sfe_ipv6_tx() - * Transmit an IPv6 message to the sfe - * - * @param sfe_ctx SFE context - * @param msg The IPv6 message - * - * @return sfe_tx_status_t The status of the Tx operation - */ -sfe_tx_status_t sfe_ipv6_tx(struct sfe_ctx_instance *sfe_ctx, struct sfe_ipv6_msg *msg) -{ - switch (msg->cm.type) { - case SFE_TX_CREATE_RULE_MSG: - return sfe_create_ipv6_rule_msg(SFE_CTX_TO_PRIVATE(sfe_ctx), msg); - case SFE_TX_DESTROY_RULE_MSG: - return sfe_destroy_ipv6_rule_msg(SFE_CTX_TO_PRIVATE(sfe_ctx), msg); - case SFE_TX_CONN_STATS_SYNC_MANY_MSG: - return sfe_sync_ipv6_stats_many_msg(SFE_CTX_TO_PRIVATE(sfe_ctx), msg); - default: - sfe_incr_exceptions(SFE_EXCEPTION_IPV6_MSG_UNKNOW); - return SFE_TX_FAILURE_NOT_ENABLED; - } -} -EXPORT_SYMBOL(sfe_ipv6_tx); - -/* - * sfe_ipv6_msg_init() - * Initialize IPv6 message. 
- */ -void sfe_ipv6_msg_init(struct sfe_ipv6_msg *nim, u16 if_num, u32 type, u32 len, - sfe_ipv6_msg_callback_t cb, void *app_data) -{ - sfe_cmn_msg_init(&nim->cm, if_num, type, len, (void *)cb, app_data); -} -EXPORT_SYMBOL(sfe_ipv6_msg_init); - -/* - * sfe_ipv6_max_conn_count() - * Return maximum number of entries SFE supported - */ -int sfe_ipv6_max_conn_count(void) -{ - return max_ipv6_conn; -} -EXPORT_SYMBOL(sfe_ipv6_max_conn_count); - -/* - * sfe_ipv6_notify_register() - * Register a notifier callback for IPv6 messages from SFE - * - * @param one_rule_cb The callback pointer of one rule sync - * @param many_rule_cb The callback pointer of many rule sync - * @param app_data The application context for this message - * - * @return struct sfe_ctx_instance * The SFE context - */ -struct sfe_ctx_instance *sfe_ipv6_notify_register(sfe_ipv6_msg_callback_t one_rule_cb, - sfe_ipv6_msg_callback_t many_rule_cb, void *app_data) -{ - struct sfe_ctx_instance_internal *sfe_ctx = &__sfe_ctx; - - spin_lock_bh(&sfe_ctx->lock); - /* - * Hook the shortcut sync callback. - */ - if (one_rule_cb && !sfe_ctx->ipv6_stats_sync_cb) { - sfe_ipv6_register_sync_rule_callback(sfe_ipv6_stats_sync_callback); - } - rcu_assign_pointer(sfe_ctx->ipv6_stats_sync_cb, one_rule_cb); - - if (many_rule_cb && !sfe_ctx->ipv6_stats_sync_many_cb) { - sfe_ipv6_register_many_sync_callback(sfe_ipv6_many_stats_sync_callback); - } - rcu_assign_pointer(sfe_ctx->ipv6_stats_sync_many_cb, many_rule_cb); - - sfe_ctx->ipv6_stats_sync_data = app_data; - - spin_unlock_bh(&sfe_ctx->lock); - - return SFE_CTX_TO_PUBLIC(sfe_ctx); -} -EXPORT_SYMBOL(sfe_ipv6_notify_register); - -/* - * sfe_ipv6_notify_unregister() - * Un-Register a notifier callback for IPv6 messages from SFE - */ -void sfe_ipv6_notify_unregister(void) -{ - struct sfe_ctx_instance_internal *sfe_ctx = &__sfe_ctx; - - spin_lock_bh(&sfe_ctx->lock); - /* - * Unregister our sync callback. - */ - if (sfe_ctx->ipv6_stats_sync_cb) { - sfe_ipv6_register_sync_rule_callback(NULL); - rcu_assign_pointer(sfe_ctx->ipv6_stats_sync_cb, NULL); - } - - if (sfe_ctx->ipv6_stats_sync_many_cb) { - sfe_ipv6_register_many_sync_callback(NULL); - rcu_assign_pointer(sfe_ctx->ipv6_stats_sync_many_cb, NULL); - } - - sfe_ctx->ipv6_stats_sync_data = NULL; - spin_unlock_bh(&sfe_ctx->lock); - - sfe_clean_response_msg_by_type(sfe_ctx, SFE_MSG_TYPE_IPV6); - return; -} -EXPORT_SYMBOL(sfe_ipv6_notify_unregister); - -/* - * sfe_tun6rd_tx() - * Transmit a tun6rd message to sfe engine - */ -sfe_tx_status_t sfe_tun6rd_tx(struct sfe_ctx_instance *sfe_ctx, struct sfe_tun6rd_msg *msg) -{ - sfe_incr_exceptions(SFE_EXCEPTION_NOT_SUPPORT_6RD); - return SFE_TX_FAILURE_NOT_ENABLED; -} -EXPORT_SYMBOL(sfe_tun6rd_tx); - -/* - * sfe_tun6rd_msg_init() - * Initialize sfe_tun6rd msg. - */ -void sfe_tun6rd_msg_init(struct sfe_tun6rd_msg *ncm, u16 if_num, u32 type, u32 len, void *cb, void *app_data) -{ - sfe_cmn_msg_init(&ncm->cm, if_num, type, len, cb, app_data); -} -EXPORT_SYMBOL(sfe_tun6rd_msg_init); - -/* - * sfe_recv() - * Handle packet receives. - * - * Returns 1 if the packet is forwarded or 0 if it isn't. - */ -int sfe_recv(struct sk_buff *skb) -{ - struct net_device *dev; - struct sfe_l2_info l2_info; - int ret; - - dev = skb->dev; - - /* - * Setting parse flags to 0 since l2_info is passed for non L2.5 header case as well - */ - l2_info.parse_flags = 0; - l2_info.vlan_hdr_cnt = 0; - -#ifdef CONFIG_NET_CLS_ACT - /* - * If ingress Qdisc configured, and packet not processed by ingress Qdisc yet - * We can not accelerate this packet. 
- */ -#if (LINUX_VERSION_CODE < KERNEL_VERSION(5, 4, 0)) - if (dev->ingress_queue && !(skb->tc_verd & TC_NCLS)) { - return 0; - } -#else - if (rcu_access_pointer(dev->miniq_ingress) && !skb->tc_skip_classify) { - return 0; - } -#endif -#endif - - /* - * If l2_feature is enabled, we need not check if src dev is L3 interface since bridge flow offload is supported. - * If l2_feature is disabled, then we make sure src dev is L3 interface to avoid cost of rule lookup for L2 flows - */ - switch (ntohs(skb->protocol)) { - case ETH_P_IP: - if (likely(sfe_is_l2_feature_enabled()) || sfe_dev_is_layer_3_interface(dev, true)) { - return sfe_ipv4_recv(dev, skb, &l2_info, false); - } - - DEBUG_TRACE("No IPv4 address for device: %s skb=%px\n", dev->name, skb); - return 0; - - case ETH_P_IPV6: - if (likely(sfe_is_l2_feature_enabled()) || sfe_dev_is_layer_3_interface(dev, false)) { - return sfe_ipv6_recv(dev, skb, &l2_info, false); - } - - DEBUG_TRACE("No IPv6 address for device: %s skb=%px\n", dev->name, skb); - return 0; - - default: - break; - } - - /* - * Stop L2 processing if L2 feature is disabled. - */ - if (!sfe_is_l2_feature_enabled()) { - DEBUG_TRACE("Unsupported protocol %#x %s (L2 feature is disabled) skb=%px\n", - ntohs(skb->protocol), dev->name, skb); - return 0; - } - - /* - * Parse the L2 headers to find the L3 protocol and the L2 header offset - */ - if (unlikely(!sfe_recv_parse_l2(dev, skb, &l2_info))) { - DEBUG_TRACE("%px: Invalid L2.5 header format with protocol : %x\n", skb, ntohs(skb->protocol)); - goto send_to_linux; - } - - /* - * Protocol in l2_info is expected to be in host byte order. - * PPPoE is doing it in the sfe_pppoe_parse_hdr() - */ - if (likely(l2_info.protocol == ETH_P_IP)) { - ret = sfe_ipv4_recv(dev, skb, &l2_info, false); - if (unlikely(!ret)) { - goto send_to_linux; - } - return ret; - } - - if (likely(l2_info.protocol == ETH_P_IPV6)) { - ret = sfe_ipv6_recv(dev, skb, &l2_info, false); - if (unlikely(!ret)) { - goto send_to_linux; - } - return ret; - } - - DEBUG_TRACE("Non-IP(%x) %s skb=%px skb_vlan:%x/%x/%x skb_proto=%x\n", - l2_info.protocol, dev->name, skb, - ntohs(skb->vlan_proto), skb->vlan_tci, skb_vlan_tag_present(skb), - htons(skb->protocol)); - -send_to_linux: - /* - * Push the data back before sending to linux if - - * a. There is any exception from IPV4/V6 - * b. If the next protocol is neither IPV4 nor IPV6 - */ - sfe_recv_undo_parse_l2(dev, skb, &l2_info); - - return 0; -} - -/* - * sfe_get_exceptions() - * Dump exception counters - */ -static ssize_t sfe_get_exceptions(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - int idx, len; - struct sfe_ctx_instance_internal *sfe_ctx = &__sfe_ctx; - - spin_lock_bh(&sfe_ctx->lock); - for (len = 0, idx = 0; idx < SFE_EXCEPTION_MAX; idx++) { - if (sfe_ctx->exceptions[idx]) { - len += snprintf(buf + len, (ssize_t)(PAGE_SIZE - len), "%s = %d\n", sfe_exception_events_string[idx], sfe_ctx->exceptions[idx]); - } - } - spin_unlock_bh(&sfe_ctx->lock); - - return len; -} - -/* - * sysfs attributes. - */ -static const struct device_attribute sfe_exceptions_attr = - __ATTR(exceptions, S_IRUGO, sfe_get_exceptions, NULL); - - -/* - * sfe_service_class_stats_get() - * Collects ipv4 and ipv6 service class statistics and aggregates them. 
- */ -bool sfe_service_class_stats_get(uint8_t sid, uint64_t *bytes, uint64_t *packets) -{ - *bytes = 0; - *packets = 0; - - if (!sfe_ipv4_service_class_stats_get(sid, bytes, packets)) { - return false; - } - - if (!sfe_ipv6_service_class_stats_get(sid, bytes, packets)) { - return false; - } - - return true; -} -EXPORT_SYMBOL(sfe_service_class_stats_get); - -/* - * sfe_is_l2_feature_enabled() - * Check if l2 features flag feature is enabled or not. (VLAN, PPPOE, BRIDGE and tunnels) - * - * 32bit read is atomic. No need of locks. - */ -bool sfe_is_l2_feature_enabled() -{ - struct sfe_ctx_instance_internal *sfe_ctx = &__sfe_ctx; - return (sfe_ctx->l2_feature_support == 1); -} -EXPORT_SYMBOL(sfe_is_l2_feature_enabled); - -/* - * sfe_get_l2_feature() - * L2 feature is enabled/disabled - */ -ssize_t sfe_get_l2_feature(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct sfe_ctx_instance_internal *sfe_ctx = &__sfe_ctx; - ssize_t len; - - spin_lock_bh(&sfe_ctx->lock); - len = snprintf(buf, (ssize_t)(PAGE_SIZE), "L2 feature is %s\n", sfe_ctx->l2_feature_support ? "enabled" : "disabled"); - spin_unlock_bh(&sfe_ctx->lock); - return len; -} - -/* - * sfe_set_l2_feature() - * Enable or disable l2 features flag. - */ -ssize_t sfe_set_l2_feature(struct device *dev, struct device_attribute *attr, - const char *buf, size_t count) -{ - unsigned long val; - struct sfe_ctx_instance_internal *sfe_ctx = &__sfe_ctx; - int ret; - ret = sscanf(buf, "%lu", &val); - - if (ret != 1) { - pr_err("Wrong input, %s\n", buf); - return -EINVAL; - } - - if (val != 1 && val != 0) { - pr_err("Input should be either 1 or 0, (%s)\n", buf); - return -EINVAL; - } - - spin_lock_bh(&sfe_ctx->lock); - - if (sfe_ctx->l2_feature_support && val) { - spin_unlock_bh(&sfe_ctx->lock); - pr_err("L2 feature is already enabled\n"); - return -EINVAL; - } - - if (!sfe_ctx->l2_feature_support && !val) { - spin_unlock_bh(&sfe_ctx->lock); - pr_err("L2 feature is already disabled\n"); - return -EINVAL; - } - - sfe_ctx->l2_feature_support = val; - spin_unlock_bh(&sfe_ctx->lock); - - return count; -} - -static const struct device_attribute sfe_l2_feature_attr = - __ATTR(l2_feature, 0644, sfe_get_l2_feature, sfe_set_l2_feature); - -/* - * sfe_get_pppoe_br_accel_mode() - * Get PPPoE bridge acceleration mode - */ -static ssize_t sfe_get_pppoe_br_accel_mode(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - int len; - sfe_pppoe_br_accel_mode_t mode; - char *str; - - mode = sfe_pppoe_get_br_accel_mode(); - switch ((int)mode) { - case SFE_PPPOE_BR_ACCEL_MODE_DISABLED: - str = "ACCEL_MODE_DISABLED"; - break; - - case SFE_PPPOE_BR_ACCEL_MODE_EN_5T: - str = "ACCEL_MODE_5_TUPLE"; - break; - - case SFE_PPPOE_BR_ACCEL_MODE_EN_3T: - str = "ACCEL_MODE_3_TUPLE"; - break; - - default: - str = "Unknown ACCEL_MODE"; - break; - } - len = snprintf(buf, PAGE_SIZE, "%s\n", str); - - return len; -} - -/* - * sfe_set_pppoe_br_accel_mode() - * Set PPPoE bridge acceleration mode - */ -static ssize_t sfe_set_pppoe_br_accel_mode(struct device *dev, - struct device_attribute *attr, - const char *buf, - size_t count) -{ - uint32_t val; - int ret; - - ret = sscanf(buf, "%u", &val); - if (ret != 1) { - DEBUG_ERROR("Unable to write the mode\n"); - return -EINVAL; - } - - ret = sfe_pppoe_set_br_accel_mode(val); - if (ret) { - DEBUG_ERROR("Wrong input: %d\n" - "Input should be %u or %u or %u\n" - "(%u==ACCEL_MODE_DISABLED %u==ACCEL_MODE_EN_5T %u==ACCEL_MODE_EN_3T)\n", - val, - SFE_PPPOE_BR_ACCEL_MODE_DISABLED, SFE_PPPOE_BR_ACCEL_MODE_EN_5T, 
SFE_PPPOE_BR_ACCEL_MODE_EN_3T, - SFE_PPPOE_BR_ACCEL_MODE_DISABLED, SFE_PPPOE_BR_ACCEL_MODE_EN_5T, SFE_PPPOE_BR_ACCEL_MODE_EN_3T); - return -EINVAL; - } - - return count; -} - -static const struct device_attribute sfe_pppoe_br_accel_mode_attr = - __ATTR(pppoe_br_accel_mode, 0644, sfe_get_pppoe_br_accel_mode, sfe_set_pppoe_br_accel_mode); - -/* - * sfe_init_if() - */ -int sfe_init_if(void) -{ - struct sfe_ctx_instance_internal *sfe_ctx = &__sfe_ctx; - int result = -1; - - /* - * L2 feature is enabled by default - */ - sfe_ctx->l2_feature_support = 1; - - /* - * Create sys/sfe - */ - sfe_ctx->sys_sfe = kobject_create_and_add("sfe", NULL); - if (!sfe_ctx->sys_sfe) { - DEBUG_ERROR("failed to register sfe\n"); - goto exit1; - } - - /* - * Create sys/sfe/exceptions - */ - result = sysfs_create_file(sfe_ctx->sys_sfe, &sfe_exceptions_attr.attr); - if (result) { - DEBUG_ERROR("failed to register exceptions file: %d\n", result); - goto exit2; - } - - /* - * Create sys/sfe/l2_feature - */ - result = sysfs_create_file(sfe_ctx->sys_sfe, &sfe_l2_feature_attr.attr); - if (result) { - DEBUG_ERROR("failed to register L2 feature flag sysfs file: %d\n", result); - goto exit2; - } - - /* - * Create sys/sfe/pppoe_br_accel_mode - */ - result = sysfs_create_file(sfe_ctx->sys_sfe, &sfe_pppoe_br_accel_mode_attr.attr); - if (result) { - DEBUG_ERROR("failed to create pppoe_br_accel_mode: %d\n", result); - goto exit2; - } - - sfe_pppoe_mgr_init(); - - spin_lock_init(&sfe_ctx->lock); - - INIT_LIST_HEAD(&sfe_ctx->msg_queue); - INIT_WORK(&sfe_ctx->work, sfe_process_response_msg); - - /* - * Hook the receive path in the network stack. - */ - BUG_ON(athrs_fast_nat_recv); - RCU_INIT_POINTER(athrs_fast_nat_recv, sfe_recv); - - return 0; -exit2: - kobject_put(sfe_ctx->sys_sfe); -exit1: - return result; -} - -/* - * sfe_exit_if() - */ -void sfe_exit_if(void) -{ - struct sfe_ctx_instance_internal *sfe_ctx = &__sfe_ctx; - - /* - * Unregister our receive callback. - */ - RCU_INIT_POINTER(athrs_fast_nat_recv, NULL); - - sfe_pppoe_mgr_exit(); - - /* - * Wait for all callbacks to complete. - */ - rcu_barrier(); - - /* - * Destroy all connections. - */ - sfe_ipv4_destroy_all_rules_for_dev(NULL); - sfe_ipv6_destroy_all_rules_for_dev(NULL); - - /* - * stop work queue, and flush all pending message in queue - */ - cancel_work_sync(&sfe_ctx->work); - sfe_process_response_msg(&sfe_ctx->work); - - /* - * Unregister our sync callback. - */ - sfe_ipv4_notify_unregister(); - sfe_ipv6_notify_unregister(); - - kobject_put(sfe_ctx->sys_sfe); - - return; -} diff --git a/shortcut-fe/sfe.h b/shortcut-fe/sfe.h deleted file mode 100644 index e246b534f..000000000 --- a/shortcut-fe/sfe.h +++ /dev/null @@ -1,331 +0,0 @@ -/* - * sfe.h - * Shortcut forwarding engine. - * - * Copyright (c) 2013-2016, The Linux Foundation. All rights reserved. - * Copyright (c) 2021-2022 Qualcomm Innovation Center, Inc. All rights reserved. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. 
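For reference, a minimal usage sketch of the service-class aggregation helper defined above; the service class ID is illustrative:

	uint64_t bytes, packets;

	/* The helper zeroes both counters, then adds IPv4 and IPv6 totals. */
	if (sfe_service_class_stats_get(7, &bytes, &packets))
		DEBUG_INFO("sid 7: %llu bytes, %llu packets\n",
			   (unsigned long long)bytes,
			   (unsigned long long)packets);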
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ - -#ifndef __SFE_H -#define __SFE_H - -/* - * Maximum number of accelerated IPv4 or IPv6 connections - */ -#if defined(SFE_MEM_PROFILE_LOW) -#define SFE_MAX_CONNECTION_NUM 512 -#elif defined(SFE_MEM_PROFILE_MEDIUM) -#define SFE_MAX_CONNECTION_NUM 2048 -#else -#define SFE_MAX_CONNECTION_NUM 4096 -#endif - -#define SFE_L2_PARSE_FLAGS_PPPOE_INGRESS 0x01 /* Indicates presence of a valid PPPoE header */ - -/** - * SAWF_metadata information placement in mark field. - */ -#define SFE_SAWF_VALID_TAG 0xAA -#define SFE_SAWF_TAG_SHIFT 0x18 -#define SFE_SAWF_SERVICE_CLASS_SHIFT 0x10 -#define SFE_SAWF_SERVICE_CLASS_MASK 0xff -#define SFE_SAWF_MSDUQ_MASK 0xffff - -/** - * SAWF_metadata extraction. - */ -#define SFE_GET_SAWF_TAG(x) (x>>SFE_SAWF_TAG_SHIFT) -#define SFE_GET_SAWF_SERVICE_CLASS(x) ((x>>SFE_SAWF_SERVICE_CLASS_SHIFT) & SFE_SAWF_SERVICE_CLASS_MASK) -#define SFE_GET_SAWF_MSDUQ(x) (x & SFE_SAWF_MSDUQ_MASK) -#define SFE_SAWF_TAG_IS_VALID(x) ((x == SFE_SAWF_VALID_TAG) ? true : false) - -/* - * IPv6 address structure - */ -struct sfe_ipv6_addr { - __be32 addr[4]; -}; - -typedef union { - __be32 ip; - struct sfe_ipv6_addr ip6[1]; -} sfe_ip_addr_t; - -typedef enum sfe_sync_reason { - SFE_SYNC_REASON_STATS, /* Sync is to synchronize stats */ - SFE_SYNC_REASON_FLUSH, /* Sync is to flush a entry */ - SFE_SYNC_REASON_DESTROY /* Sync is to destroy a entry(requested by connection manager) */ -} sfe_sync_reason_t; - -/* - * VLAN header (aka VLAN tag) - */ -struct sfe_vlan_hdr { - u16 tpid; /* Tag Protocol Identifier */ - u16 tci; /* Tag Control Information */ -}; - -/* - * Structure used to store L2 information - */ -struct sfe_l2_info { - u16 parse_flags; /* Flags indicating L2.5 headers presence */ - u16 pppoe_session_id; /* PPPOE header offset */ - u16 protocol; /* L3 Protocol */ - struct sfe_vlan_hdr vlan_hdr[SFE_MAX_VLAN_DEPTH]; - /* VLAN tag(s) of ingress packet */ - u8 vlan_hdr_cnt; /* Number of VLAN tags in the ingress packet */ -}; - -/* - * Structure used to sync connection stats/state back within the system. - * - * NOTE: The addresses here are NON-NAT addresses, i.e. the true endpoint addressing. - * 'src' is the creator of the connection. - */ -struct sfe_connection_sync { - struct net_device *src_dev; - struct net_device *dest_dev; - int is_v6; /* Is it for ipv6? */ - int protocol; /* IP protocol number (IPPROTO_...) */ - sfe_ip_addr_t src_ip; /* Non-NAT source address, i.e. the creator of the connection */ - sfe_ip_addr_t src_ip_xlate; /* NATed source address */ - __be16 src_port; /* Non-NAT source port */ - __be16 src_port_xlate; /* NATed source port */ - sfe_ip_addr_t dest_ip; /* Non-NAT destination address, i.e. 
to whom the connection was created */ - sfe_ip_addr_t dest_ip_xlate; /* NATed destination address */ - __be16 dest_port; /* Non-NAT destination port */ - __be16 dest_port_xlate; /* NATed destination port */ - u32 src_td_max_window; - u32 src_td_end; - u32 src_td_max_end; - u64 src_packet_count; - u64 src_byte_count; - u32 src_new_packet_count; - u32 src_new_byte_count; - u32 dest_td_max_window; - u32 dest_td_end; - u32 dest_td_max_end; - u64 dest_packet_count; - u64 dest_byte_count; - u32 dest_new_packet_count; - u32 dest_new_byte_count; - u32 reason; /* reason for stats sync message, i.e. destroy, flush, period sync */ - u64 delta_jiffies; /* Time to be added to the current timeout to keep the connection alive */ -}; - -/* - * Expose the hook for the receive processing. - */ -extern int (*athrs_fast_nat_recv)(struct sk_buff *skb); - -/* - * Expose what should be a static flag in the TCP connection tracker. - */ -extern int nf_ct_tcp_no_window_check; - -/* - * Check the fast transmit feasibility. - */ -bool sfe_fast_xmit_check(struct sk_buff *skb, netdev_features_t features); - -/* - * This callback will be called in a timer - * at 100 times per second to sync stats back to - * Linux connection track. - * - * A RCU lock is taken to prevent this callback - * from unregistering. - */ -typedef void (*sfe_sync_rule_callback_t)(struct sfe_connection_sync *); -typedef void (*sfe_ipv4_many_sync_callback_t)(struct sfe_ipv4_msg *msg); -typedef void (*sfe_ipv6_many_sync_callback_t)(struct sfe_ipv6_msg *msg); - -/* - * IPv4 APIs used by connection manager - */ -int sfe_ipv4_recv(struct net_device *dev, struct sk_buff *skb, struct sfe_l2_info *l2_info, bool tun_outer); -int sfe_ipv4_create_rule(struct sfe_ipv4_rule_create_msg *msg); -void sfe_ipv4_destroy_rule(struct sfe_ipv4_rule_destroy_msg *msg); -void sfe_ipv4_destroy_all_rules_for_dev(struct net_device *dev); -void sfe_ipv4_register_sync_rule_callback(sfe_sync_rule_callback_t callback); -void sfe_ipv4_update_rule(struct sfe_ipv4_rule_create_msg *msg); -bool sfe_dev_has_hw_csum(struct net_device *dev); - -bool sfe_ipv4_sync_invoke(uint16_t index); -void sfe_ipv4_register_many_sync_callback(sfe_ipv4_many_sync_callback_t cb); -void sfe_ipv4_stats_convert(struct sfe_ipv4_conn_sync *sync_msg, struct sfe_connection_sync *sis); -#ifdef SFE_SUPPORT_IPV6 -/* - * IPv6 APIs used by connection manager - */ -int sfe_ipv6_recv(struct net_device *dev, struct sk_buff *skb, struct sfe_l2_info *l2_info, bool tun_outer); -int sfe_ipv6_create_rule(struct sfe_ipv6_rule_create_msg *msg); -void sfe_ipv6_destroy_rule(struct sfe_ipv6_rule_destroy_msg *msg); -void sfe_ipv6_destroy_all_rules_for_dev(struct net_device *dev); -void sfe_ipv6_register_sync_rule_callback(sfe_sync_rule_callback_t callback); -void sfe_ipv6_update_rule(struct sfe_ipv6_rule_create_msg *msg); -bool sfe_ipv6_sync_invoke(uint16_t index); -void sfe_ipv6_register_many_sync_callback(sfe_ipv6_many_sync_callback_t cb); -void sfe_ipv6_stats_convert(struct sfe_ipv6_conn_sync *sync_msg, struct sfe_connection_sync *sis); -#else -static inline int sfe_ipv6_recv(struct net_device *dev, struct sk_buff *skb, struct sfe_l2_info *l2_info, bool tun_outer) -{ - return 0; -} - -static inline int sfe_ipv6_create_rule(struct sfe_ipv6_rule_create_msg *msg) -{ - return 0; -} - -static inline void sfe_ipv6_destroy_rule(struct sfe_ipv6_rule_destroy_msg *msg) -{ - return; -} - -static inline void sfe_ipv6_destroy_all_rules_for_dev(struct net_device *dev) -{ - return; -} - -static inline void 
sfe_ipv6_register_sync_rule_callback(sfe_sync_rule_callback_t callback) -{ - return; -} - -static inline void sfe_ipv6_update_rule(struct sfe_ipv6_rule_create_msg *msg) -{ - return; -} - -static inline bool sfe_ipv6_sync_invoke(uint16_t index) -{ - return false; -} - -static inline void sfe_ipv6_register_many_sync_callback(sfe_ipv6_many_sync_callback_t cb) -{ - return; -} - -static inline void sfe_ipv6_stats_convert(struct sfe_ipv6_conn_sync *sync_msg, struct sfe_connection_sync *sis) -{ - return; -} -#endif - -/* - * sfe_ipv6_addr_equal() - * compare ipv6 address - * - * return: 1, equal; 0, no equal - */ -static inline int sfe_ipv6_addr_equal(struct sfe_ipv6_addr *a, - struct sfe_ipv6_addr *b) -{ - return a->addr[0] == b->addr[0] && - a->addr[1] == b->addr[1] && - a->addr[2] == b->addr[2] && - a->addr[3] == b->addr[3]; -} - -/* - * sfe_ipv4_addr_equal() - * compare ipv4 address - * - * return: 1, equal; 0, no equal - */ -#define sfe_ipv4_addr_equal(a, b) ((u32)(a) == (u32)(b)) - -/* - * sfe_addr_equal() - * compare ipv4 or ipv6 address - * - * return: 1, equal; 0, no equal - */ -static inline int sfe_addr_equal(sfe_ip_addr_t *a, - sfe_ip_addr_t *b, int is_v4) -{ - return is_v4 ? sfe_ipv4_addr_equal(a->ip, b->ip) : sfe_ipv6_addr_equal(a->ip6, b->ip6); -} - -/* - * sfe_l2_parse_flag_set() - * Set L2 parse flag - */ -static inline void sfe_l2_parse_flag_set(struct sfe_l2_info *l2_info, u16 flag) -{ - l2_info->parse_flags |= flag; -} - -/* - * sfe_l2_parse_flag_get() - * Get L2 parse flag - */ -static inline u16 sfe_l2_parse_flag_get(struct sfe_l2_info *l2_info) -{ - return l2_info->parse_flags; -} - -/* - * sfe_l2_parse_flag_check() - * Check L2 parse flag - */ -static inline bool sfe_l2_parse_flag_check(struct sfe_l2_info *l2_info, u16 flag) -{ - return !!(l2_info->parse_flags & flag); -} - -/* - * sfe_l2_pppoe_session_id_get() - * Get PPPPoE session ID from l2_info - */ -static inline u16 sfe_l2_pppoe_session_id_get(struct sfe_l2_info *l2_info) -{ - return l2_info->pppoe_session_id; -} - -/* - * sfe_l2_pppoe_session_id_set() - * Set PPPoE session ID to l2_info - */ -static inline void sfe_l2_pppoe_session_id_set(struct sfe_l2_info *l2_info, u16 session_id) -{ - l2_info->pppoe_session_id = session_id; -} - -/* - * sfe_l2_protocol_get() - * Get L2 protocol - */ -static inline u16 sfe_l2_protocol_get(struct sfe_l2_info *l2_info) -{ - return l2_info->protocol; -} - -/* - * sfe_l2_protocol_set() - * Set L2 protocol - */ -static inline void sfe_l2_protocol_set(struct sfe_l2_info *l2_info, u16 proto) -{ - l2_info->protocol = proto; -} - -int sfe_init_if(void); -void sfe_exit_if(void); - -#endif /* __SFE_H */ diff --git a/shortcut-fe/sfe_debug.h b/shortcut-fe/sfe_debug.h deleted file mode 100644 index 77d1cc29c..000000000 --- a/shortcut-fe/sfe_debug.h +++ /dev/null @@ -1,72 +0,0 @@ - -/* - * sfe_debug.h - * SFE debug macros. - * - * Copyright (c) 2021 Qualcomm Innovation Center, Inc. All rights reserved. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. 
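The SAWF macros earlier in this header pack a validity tag, a service class, and an MSDU queue index into skb->mark. A worked example of unpacking one (the mark value is made up):

	/* mark 0xAA07002A: tag 0xAA (valid), service class 0x07, msduq 0x002A */
	u32 mark = 0xAA07002A;

	if (SFE_SAWF_TAG_IS_VALID(SFE_GET_SAWF_TAG(mark))) {
		u8 service_class = SFE_GET_SAWF_SERVICE_CLASS(mark); /* 0x07 */
		u16 msduq = SFE_GET_SAWF_MSDUQ(mark); /* 0x002a */
		/* ...apply service_class / msduq to the connection... */
	}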
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ - -/* - * The following are debug macros used throughout the SFE. - * - * The DEBUG_LEVEL enables the followings based on its value, - * when dynamic debug option is disabled. - * - * 0 = OFF - * 1 = ASSERTS / ERRORS - * 2 = 1 + WARN - * 3 = 2 + INFO - * 4 = 3 + TRACE - */ -#define DEBUG_LEVEL 2 - -#if (DEBUG_LEVEL < 1) -#define DEBUG_ASSERT(s, ...) -#define DEBUG_ERROR(s, ...) -#else -#define DEBUG_ASSERT(c, s, ...) if (!(c)) { pr_emerg("ASSERT: %s:%d:" s, __FUNCTION__, __LINE__, ##__VA_ARGS__); BUG(); } -#define DEBUG_ERROR(s, ...) pr_err("%s:%d:" s, __FUNCTION__, __LINE__, ##__VA_ARGS__) -#endif - -#if defined(CONFIG_DYNAMIC_DEBUG) -/* - * Compile messages for dynamic enable/disable - */ -#define DEBUG_WARN(s, ...) pr_debug("%s[%d]:" s, __FUNCTION__, __LINE__, ##__VA_ARGS__) -#define DEBUG_INFO(s, ...) pr_debug("%s[%d]:" s, __FUNCTION__, __LINE__, ##__VA_ARGS__) -#define DEBUG_TRACE(s, ...) pr_debug("%s[%d]:" s, __FUNCTION__, __LINE__, ##__VA_ARGS__) -#else - -/* - * Statically compile messages at different levels - */ -#if (DEBUG_LEVEL < 2) -#define DEBUG_WARN(s, ...) -#else -#define DEBUG_WARN(s, ...) pr_warn("%s[%d]:" s, __FUNCTION__, __LINE__, ##__VA_ARGS__) -#endif - -#if (DEBUG_LEVEL < 3) -#define DEBUG_INFO(s, ...) -#else -#define DEBUG_INFO(s, ...) pr_notice("%s[%d]:" s, __FUNCTION__, __LINE__, ##__VA_ARGS__) -#endif - -#if (DEBUG_LEVEL < 4) -#define DEBUG_TRACE(s, ...) -#else -#define DEBUG_TRACE(s, ...) pr_info("%s[%d]:" s, __FUNCTION__, __LINE__, ##__VA_ARGS__) -#endif -#endif diff --git a/shortcut-fe/sfe_init.c b/shortcut-fe/sfe_init.c deleted file mode 100644 index 2fe34d751..000000000 --- a/shortcut-fe/sfe_init.c +++ /dev/null @@ -1,104 +0,0 @@ -/* - * sfe_init.c - * Shortcut forwarding engine initialization. - * - * Copyright (c) 2021 Qualcomm Innovation Center, Inc. All rights reserved. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ - -#include -#include -#include -#include -#include -#include - -#include "sfe_api.h" -#include "sfe.h" -#include "sfe_ipv4.h" -#include "sfe_ipv6.h" - -int max_ipv4_conn = SFE_MAX_CONNECTION_NUM; -module_param(max_ipv4_conn, int, S_IRUGO); -MODULE_PARM_DESC(max_ipv4_conn, "Max number of IPv4 connections"); - -int max_ipv6_conn = SFE_MAX_CONNECTION_NUM; -module_param(max_ipv6_conn, int, S_IRUGO); -MODULE_PARM_DESC(max_ipv6_conn, "Max number of IPv6 connections"); - -/* - * sfe_init() - * Initialize SFE engine. - */ -static int __init sfe_init(void) -{ - /* - * Initialize SFE IPv4 engine. 
- */ - if (sfe_ipv4_init()) { - goto fail0; - } - -#ifdef SFE_SUPPORT_IPV6 - - /* - * Initialize SFE IPv6 engine. - */ - if (sfe_ipv6_init()) { - goto fail1; - } -#endif - - /* - * Initialize SFE infrastructure and register SFE hook with Linux stack - */ - if (sfe_init_if()) { - goto fail2; - } - - return 0; - -fail2: -#ifdef SFE_SUPPORT_IPV6 - sfe_ipv6_exit(); -fail1: -#endif - - sfe_ipv4_exit(); - -fail0: - - return -1; -} - -/* - * sfe_exit() - */ -static void __exit sfe_exit(void) -{ - - sfe_exit_if(); - -#ifdef SFE_SUPPORT_IPV6 - sfe_ipv6_exit(); -#endif - sfe_ipv4_exit(); -} - -module_init(sfe_init) -module_exit(sfe_exit) - -MODULE_AUTHOR("Qualcomm Technologies"); -MODULE_DESCRIPTION("Shortcut Forwarding Engine"); -MODULE_LICENSE("Dual BSD/GPL"); diff --git a/shortcut-fe/sfe_ipv4.c b/shortcut-fe/sfe_ipv4.c deleted file mode 100644 index aa59de523..000000000 --- a/shortcut-fe/sfe_ipv4.c +++ /dev/null @@ -1,2961 +0,0 @@ -/* - * sfe_ipv4.c - * Shortcut forwarding engine - IPv4 edition. - * - * Copyright (c) 2013-2016, 2019-2020, The Linux Foundation. All rights reserved. - * Copyright (c) 2021-2022 Qualcomm Innovation Center, Inc. All rights reserved. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "sfe_debug.h" -#include "sfe_api.h" -#include "sfe.h" -#include "sfe_flow_cookie.h" -#include "sfe_ipv4.h" -#include "sfe_ipv4_udp.h" -#include "sfe_ipv4_tcp.h" -#include "sfe_ipv4_icmp.h" -#include "sfe_pppoe.h" -#include "sfe_pppoe_mgr.h" -#include "sfe_ipv4_pppoe_br.h" -#include "sfe_ipv4_gre.h" -#include "sfe_ipv4_tun6rd.h" -#include "sfe_ipv4_esp.h" - -static char *sfe_ipv4_exception_events_string[SFE_IPV4_EXCEPTION_EVENT_LAST] = { - "UDP_HEADER_INCOMPLETE", - "UDP_NO_CONNECTION", - "UDP_IP_OPTIONS_OR_INITIAL_FRAGMENT", - "UDP_SMALL_TTL", - "UDP_NEEDS_FRAGMENTATION", - "TCP_HEADER_INCOMPLETE", - "TCP_NO_CONNECTION_SLOW_FLAGS", - "TCP_NO_CONNECTION_FAST_FLAGS", - "TCP_IP_OPTIONS_OR_INITIAL_FRAGMENT", - "TCP_SMALL_TTL", - "TCP_NEEDS_FRAGMENTATION", - "TCP_FLAGS", - "TCP_SEQ_EXCEEDS_RIGHT_EDGE", - "TCP_SMALL_DATA_OFFS", - "TCP_BAD_SACK", - "TCP_BIG_DATA_OFFS", - "TCP_SEQ_BEFORE_LEFT_EDGE", - "TCP_ACK_EXCEEDS_RIGHT_EDGE", - "TCP_ACK_BEFORE_LEFT_EDGE", - "ICMP_HEADER_INCOMPLETE", - "ICMP_UNHANDLED_TYPE", - "ICMP_IPV4_HEADER_INCOMPLETE", - "ICMP_IPV4_NON_V4", - "ICMP_IPV4_IP_OPTIONS_INCOMPLETE", - "ICMP_IPV4_UDP_HEADER_INCOMPLETE", - "ICMP_IPV4_TCP_HEADER_INCOMPLETE", - "ICMP_IPV4_UNHANDLED_PROTOCOL", - "ICMP_NO_CONNECTION", - "ICMP_FLUSHED_CONNECTION", - "HEADER_INCOMPLETE", - "HEADER_CSUM_BAD", - "BAD_TOTAL_LENGTH", - "NON_V4", - "NON_INITIAL_FRAGMENT", - "DATAGRAM_INCOMPLETE", - "IP_OPTIONS_INCOMPLETE", - "UNHANDLED_PROTOCOL", - "NO_HEADROOM", - "INVALID_PPPOE_SESSION", - "INCORRECT_PPPOE_PARSING", - "PPPOE_NOT_SET_IN_CME", - "PPPOE_BR_NOT_IN_CME", - "INGRESS_VLAN_TAG_MISMATCH", - "INVALID_SOURCE_INTERFACE", - "TUN6RD_NO_CONNECTION", - "TUN6RD_NEEDS_FRAGMENTATION", - "TUN6RD_SYNC_ON_FIND", - "GRE_HEADER_INCOMPLETE", - "GRE_NO_CONNECTION", - "GRE_IP_OPTIONS_OR_INITIAL_FRAGMENT", - "GRE_SMALL_TTL", - "GRE_NEEDS_FRAGMENTATION", - "ESP_NO_CONNECTION", - "ESP_IP_OPTIONS_OR_INITIAL_FRAGMENT", - "ESP_NEEDS_FRAGMENTATION", - "ESP_SMALL_TTL" -}; - -static struct sfe_ipv4 __si; -struct sfe_ipv4_msg *sfe_ipv4_sync_many_msg; -uint32_t sfe_ipv4_sync_max_number; - -/* - * sfe_ipv4_gen_ip_csum() - * Generate the IP checksum for an IPv4 header. - * - * Note that this function assumes that we have only 20 bytes of IP header. - */ -u16 sfe_ipv4_gen_ip_csum(struct iphdr *iph) -{ - u32 sum; - u16 *i = (u16 *)iph; - - iph->check = 0; - - /* - * Generate the sum. - */ - sum = i[0] + i[1] + i[2] + i[3] + i[4] + i[5] + i[6] + i[7] + i[8] + i[9]; - - /* - * Fold it to ones-complement form. - */ - sum = (sum & 0xffff) + (sum >> 16); - sum = (sum & 0xffff) + (sum >> 16); - - return (u16)sum ^ 0xffff; -} - -/* - * sfe_ipv4_get_connection_match_hash() - * Generate the hash used in connection match lookups. - */ -static inline unsigned int sfe_ipv4_get_connection_match_hash(struct net_device *dev, u8 protocol, - __be32 src_ip, __be16 src_port, - __be32 dest_ip, __be16 dest_port) -{ - u32 hash = ntohl(src_ip ^ dest_ip) ^ protocol ^ ntohs(src_port ^ dest_port); - return ((hash >> SFE_IPV4_CONNECTION_HASH_SHIFT) ^ hash) & SFE_IPV4_CONNECTION_HASH_MASK; -} - -/* - * sfe_ipv4_find_connection_match_rcu() - * Get the IPv4 flow match info that corresponds to a particular 5-tuple. - * - * On entry we must be holding the lock that protects the hash table. 
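 *
 * [Editor's note] The sentence above predates the RCU conversion: as the
 * WARN_ON_ONCE() in the body shows, callers now hold rcu_read_lock() rather
 * than a hash-table lock. A minimal caller sketch follows; it is illustrative
 * only, assumes the declaration from sfe_ipv4.h is in scope, and the name
 * example_has_match() is hypothetical.
 */

/*
 * example_has_match()
 *	Illustrative sketch: query the match table under an RCU read lock.
 */
static inline bool example_has_match(struct sfe_ipv4 *si, struct net_device *dev,
				     struct iphdr *iph, __be16 src_port, __be16 dest_port)
{
	struct sfe_ipv4_connection_match *cm;
	bool found;

	rcu_read_lock();
	cm = sfe_ipv4_find_connection_match_rcu(si, dev, iph->protocol,
						iph->saddr, src_port,
						iph->daddr, dest_port);
	found = (cm != NULL);

	/*
	 * Any use of cm must happen before rcu_read_unlock(); after that the
	 * entry may be freed by a concurrent destroy.
	 */
	rcu_read_unlock();

	return found;
}

/*
 * (end of editor's note - the original lookup function follows)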
- */ -struct sfe_ipv4_connection_match * -sfe_ipv4_find_connection_match_rcu(struct sfe_ipv4 *si, struct net_device *dev, u8 protocol, - __be32 src_ip, __be16 src_port, - __be32 dest_ip, __be16 dest_port) -{ - struct sfe_ipv4_connection_match *cm = NULL; - unsigned int conn_match_idx; - struct hlist_head *lhead; - - WARN_ON_ONCE(!rcu_read_lock_held()); - - conn_match_idx = sfe_ipv4_get_connection_match_hash(dev, protocol, src_ip, src_port, dest_ip, dest_port); - - lhead = &si->hlist_conn_match_hash_head[conn_match_idx]; - - hlist_for_each_entry_rcu(cm, lhead, hnode) { - if (cm->match_src_port != src_port - || cm->match_dest_port != dest_port - || cm->match_src_ip != src_ip - || cm->match_dest_ip != dest_ip - || cm->match_protocol != protocol) { - continue; - } - - this_cpu_inc(si->stats_pcpu->connection_match_hash_hits64); - - break; - } - - return cm; -} - -/* - * sfe_ipv4_connection_match_update_summary_stats() - * Update the summary stats for a connection match entry. - * - * Stats are incremented atomically. So use atomic substraction to update summary - * stats. - */ -static inline void sfe_ipv4_connection_match_update_summary_stats(struct sfe_ipv4_connection_match *cm, - u32 *packets, u32 *bytes) -{ - u32 packet_count, byte_count; - - packet_count = atomic_read(&cm->rx_packet_count); - cm->rx_packet_count64 += packet_count; - atomic_sub(packet_count, &cm->rx_packet_count); - - byte_count = atomic_read(&cm->rx_byte_count); - cm->rx_byte_count64 += byte_count; - atomic_sub(byte_count, &cm->rx_byte_count); - - *packets = packet_count; - *bytes = byte_count; -} - -/* - * sfe_ipv4_connection_match_compute_translations() - * Compute port and address translations for a connection match entry. - */ -static void sfe_ipv4_connection_match_compute_translations(struct sfe_ipv4_connection_match *cm) -{ - /* - * Before we insert the entry look to see if this is tagged as doing address - * translations. If it is then work out the adjustment that we need to apply - * to the transport checksum. - */ - if (cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_SRC) { - /* - * Precompute an incremental checksum adjustment so we can - * edit packets in this stream very quickly. The algorithm is from RFC1624. - */ - u16 src_ip_hi = cm->match_src_ip >> 16; - u16 src_ip_lo = cm->match_src_ip & 0xffff; - u32 xlate_src_ip = ~cm->xlate_src_ip; - u16 xlate_src_ip_hi = xlate_src_ip >> 16; - u16 xlate_src_ip_lo = xlate_src_ip & 0xffff; - u16 xlate_src_port = ~cm->xlate_src_port; - u32 adj; - - /* - * When we compute this fold it down to a 16-bit offset - * as that way we can avoid having to do a double - * folding of the twos-complement result because the - * addition of 2 16-bit values cannot cause a double - * wrap-around! - */ - adj = src_ip_hi + src_ip_lo + cm->match_src_port - + xlate_src_ip_hi + xlate_src_ip_lo + xlate_src_port; - adj = (adj & 0xffff) + (adj >> 16); - adj = (adj & 0xffff) + (adj >> 16); - cm->xlate_src_csum_adjustment = (u16)adj; - - } - - if (cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_DEST) { - /* - * Precompute an incremental checksum adjustment so we can - * edit packets in this stream very quickly. The algorithm is from RFC1624. 
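 *
 * [Editor's note] A worked example of the fold below: the six 16-bit terms
 * can sum to at most 6 * 0xffff = 0x5fffa. One fold gives
 * 0xfffa + 0x5 = 0xffff; the second fold is then a no-op, which is why two
 * folds always suffice to bring the adjustment back down to 16 bits.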
- */ - u16 dest_ip_hi = cm->match_dest_ip >> 16; - u16 dest_ip_lo = cm->match_dest_ip & 0xffff; - u32 xlate_dest_ip = ~cm->xlate_dest_ip; - u16 xlate_dest_ip_hi = xlate_dest_ip >> 16; - u16 xlate_dest_ip_lo = xlate_dest_ip & 0xffff; - u16 xlate_dest_port = ~cm->xlate_dest_port; - u32 adj; - - /* - * When we compute this fold it down to a 16-bit offset - * as that way we can avoid having to do a double - * folding of the twos-complement result because the - * addition of 2 16-bit values cannot cause a double - * wrap-around! - */ - adj = dest_ip_hi + dest_ip_lo + cm->match_dest_port - + xlate_dest_ip_hi + xlate_dest_ip_lo + xlate_dest_port; - adj = (adj & 0xffff) + (adj >> 16); - adj = (adj & 0xffff) + (adj >> 16); - cm->xlate_dest_csum_adjustment = (u16)adj; - } - - if (cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_SRC) { - u32 adj = ~cm->match_src_ip + cm->xlate_src_ip; - if (adj < cm->xlate_src_ip) { - adj++; - } - - adj = (adj & 0xffff) + (adj >> 16); - adj = (adj & 0xffff) + (adj >> 16); - cm->xlate_src_partial_csum_adjustment = (u16)adj; - } - - if (cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_DEST) { - u32 adj = ~cm->match_dest_ip + cm->xlate_dest_ip; - if (adj < cm->xlate_dest_ip) { - adj++; - } - - adj = (adj & 0xffff) + (adj >> 16); - adj = (adj & 0xffff) + (adj >> 16); - cm->xlate_dest_partial_csum_adjustment = (u16)adj; - } - -} - -/* - * sfe_ipv4_update_summary_stats() - * Update the summary stats. - */ -static void sfe_ipv4_update_summary_stats(struct sfe_ipv4 *si, struct sfe_ipv4_stats *stats) -{ - int i = 0; - - memset(stats, 0, sizeof(*stats)); - - for_each_possible_cpu(i) { - const struct sfe_ipv4_stats *s = per_cpu_ptr(si->stats_pcpu, i); - - stats->connection_create_requests64 += s->connection_create_requests64; - stats->connection_create_collisions64 += s->connection_create_collisions64; - stats->connection_create_failures64 += s->connection_create_failures64; - stats->connection_destroy_requests64 += s->connection_destroy_requests64; - stats->connection_destroy_misses64 += s->connection_destroy_misses64; - stats->connection_match_hash_hits64 += s->connection_match_hash_hits64; - stats->connection_match_hash_reorders64 += s->connection_match_hash_reorders64; - stats->connection_flushes64 += s->connection_flushes64; - stats->packets_dropped64 += s->packets_dropped64; - stats->packets_forwarded64 += s->packets_forwarded64; - stats->packets_fast_xmited64 += s->packets_fast_xmited64; - stats->packets_not_forwarded64 += s->packets_not_forwarded64; - stats->pppoe_encap_packets_forwarded64 += s->pppoe_encap_packets_forwarded64; - stats->pppoe_decap_packets_forwarded64 += s->pppoe_decap_packets_forwarded64; - stats->pppoe_bridge_packets_forwarded64 += s->pppoe_bridge_packets_forwarded64; - stats->pppoe_bridge_packets_3tuple_forwarded64 += s->pppoe_bridge_packets_3tuple_forwarded64; - } - -} - -/* - * sfe_ipv4_insert_connection_match() - * Insert a connection match into the hash. 
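 *
 * [Editor's note] Insertion derives the chain head with the same hash used by
 * the RCU lookup above, so writers and readers always agree on the bucket.
 * A minimal sketch of that derivation follows; EXAMPLE_HASH_SHIFT and
 * EXAMPLE_HASH_MASK are illustrative assumptions, not the real constants
 * from sfe_ipv4.h.
 */

#define EXAMPLE_HASH_SHIFT 12
#define EXAMPLE_HASH_MASK 0xfff

/*
 * example_bucket()
 *	Fold a 5-tuple into a chain index (illustrative only).
 */
static inline unsigned int example_bucket(u8 protocol, __be32 src_ip, __be16 src_port,
					  __be32 dest_ip, __be16 dest_port)
{
	u32 hash = ntohl(src_ip ^ dest_ip) ^ protocol ^ ntohs(src_port ^ dest_port);

	return ((hash >> EXAMPLE_HASH_SHIFT) ^ hash) & EXAMPLE_HASH_MASK;
}

/*
 * (end of editor's note - the original insert function follows)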
- */ -static inline void sfe_ipv4_insert_connection_match(struct sfe_ipv4 *si, - struct sfe_ipv4_connection_match *cm) -{ - unsigned int conn_match_idx - = sfe_ipv4_get_connection_match_hash(cm->match_dev, cm->match_protocol, - cm->match_src_ip, cm->match_src_port, - cm->match_dest_ip, cm->match_dest_port); - - lockdep_assert_held(&si->lock); - - hlist_add_head_rcu(&cm->hnode, &si->hlist_conn_match_hash_head[conn_match_idx]); -#ifdef CONFIG_NF_FLOW_COOKIE - if (!si->flow_cookie_enable) - return; - - /* - * Configure hardware to put a flow cookie in packet of this flow, - * then we can accelerate the lookup process when we received this packet. - */ - for (conn_match_idx = 1; conn_match_idx < SFE_FLOW_COOKIE_SIZE; conn_match_idx++) { - struct sfe_flow_cookie_entry *entry = &si->sfe_flow_cookie_table[conn_match_idx]; - - if ((NULL == entry->match) && time_is_before_jiffies(entry->last_clean_time + HZ)) { - flow_cookie_set_func_t func; - - rcu_read_lock(); - func = rcu_dereference(si->flow_cookie_set_func); - if (func) { - if (!func(cm->match_protocol, cm->match_src_ip, cm->match_src_port, - cm->match_dest_ip, cm->match_dest_port, conn_match_idx)) { - entry->match = cm; - cm->flow_cookie = conn_match_idx; - } - } - rcu_read_unlock(); - - break; - } - } -#endif -} - -/* - * sfe_ipv4_remove_connection_match() - * Remove a connection match object from the hash. - */ -static inline void sfe_ipv4_remove_connection_match(struct sfe_ipv4 *si, struct sfe_ipv4_connection_match *cm) -{ - - lockdep_assert_held(&si->lock); - -#ifdef CONFIG_NF_FLOW_COOKIE - if (si->flow_cookie_enable) { - /* - * Tell hardware that we no longer need a flow cookie in packet of this flow - */ - unsigned int conn_match_idx; - - for (conn_match_idx = 1; conn_match_idx < SFE_FLOW_COOKIE_SIZE; conn_match_idx++) { - struct sfe_flow_cookie_entry *entry = &si->sfe_flow_cookie_table[conn_match_idx]; - - if (cm == entry->match) { - flow_cookie_set_func_t func; - - rcu_read_lock(); - func = rcu_dereference(si->flow_cookie_set_func); - if (func) { - func(cm->match_protocol, cm->match_src_ip, cm->match_src_port, - cm->match_dest_ip, cm->match_dest_port, 0); - } - rcu_read_unlock(); - - cm->flow_cookie = 0; - entry->match = NULL; - entry->last_clean_time = jiffies; - break; - } - } - } -#endif - - hlist_del_init_rcu(&cm->hnode); - -} - -/* - * sfe_ipv4_get_connection_hash() - * Generate the hash used in connection lookups. - */ -static inline unsigned int sfe_ipv4_get_connection_hash(u8 protocol, __be32 src_ip, __be16 src_port, - __be32 dest_ip, __be16 dest_port) -{ - u32 hash = ntohl(src_ip ^ dest_ip) ^ protocol ^ ntohs(src_port) ^ dest_port; - return ((hash >> SFE_IPV4_CONNECTION_HASH_SHIFT) ^ hash) & SFE_IPV4_CONNECTION_HASH_MASK; -} - -/* - * sfe_ipv4_find_connection() - * Get the IPv4 connection info that corresponds to a particular 5-tuple. - * - * On entry we must be holding the lock that protects the hash table. 
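 *
 * [Editor's note] Unlike the RCU-protected match table, this connection table
 * is guarded by si->lock. A hypothetical caller sketch, using the pattern
 * that sfe_ipv4_update_rule() follows later in this file (the forward
 * declaration simply mirrors the definition just below):
 */

static inline struct sfe_ipv4_connection *sfe_ipv4_find_connection(struct sfe_ipv4 *si, u32 protocol,
								   __be32 src_ip, __be16 src_port,
								   __be32 dest_ip, __be16 dest_port);

/*
 * example_connection_exists()
 *	Illustrative only: probe the connection table under the table lock.
 */
static inline bool example_connection_exists(struct sfe_ipv4 *si, u32 protocol,
					     __be32 src_ip, __be16 src_port,
					     __be32 dest_ip, __be16 dest_port)
{
	struct sfe_ipv4_connection *c;

	spin_lock_bh(&si->lock);
	c = sfe_ipv4_find_connection(si, protocol, src_ip, src_port, dest_ip, dest_port);
	spin_unlock_bh(&si->lock);

	return c != NULL;
}

/*
 * (end of editor's note - the original function follows)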
- */ -static inline struct sfe_ipv4_connection *sfe_ipv4_find_connection(struct sfe_ipv4 *si, u32 protocol, - __be32 src_ip, __be16 src_port, - __be32 dest_ip, __be16 dest_port) -{ - struct sfe_ipv4_connection *c; - unsigned int conn_idx = sfe_ipv4_get_connection_hash(protocol, src_ip, src_port, dest_ip, dest_port); - - lockdep_assert_held(&si->lock); - - c = si->conn_hash[conn_idx]; - - /* - * Will need connection entry for next create/destroy metadata, - * So no need to re-order entry for these requests - */ - while (c) { - if ((c->src_port == src_port) - && (c->dest_port == dest_port) - && (c->src_ip == src_ip) - && (c->dest_ip == dest_ip) - && (c->protocol == protocol)) { - return c; - } - - c = c->next; - } - - return NULL; -} - -/* - * sfe_ipv4_insert_connection() - * Insert a connection into the hash. - * - * On entry we must be holding the lock that protects the hash table. - */ -static void sfe_ipv4_insert_connection(struct sfe_ipv4 *si, struct sfe_ipv4_connection *c) -{ - struct sfe_ipv4_connection **hash_head; - struct sfe_ipv4_connection *prev_head; - unsigned int conn_idx; - - lockdep_assert_held(&si->lock); - - /* - * Insert entry into the connection hash. - */ - conn_idx = sfe_ipv4_get_connection_hash(c->protocol, c->src_ip, c->src_port, - c->dest_ip, c->dest_port); - hash_head = &si->conn_hash[conn_idx]; - prev_head = *hash_head; - c->prev = NULL; - if (prev_head) { - prev_head->prev = c; - } - - c->next = prev_head; - *hash_head = c; - - /* - * Insert entry into the "all connections" list. - */ - if (si->all_connections_tail) { - c->all_connections_prev = si->all_connections_tail; - si->all_connections_tail->all_connections_next = c; - } else { - c->all_connections_prev = NULL; - si->all_connections_head = c; - } - - si->all_connections_tail = c; - c->all_connections_next = NULL; - si->num_connections++; - - /* - * Insert the connection match objects too. - */ - sfe_ipv4_insert_connection_match(si, c->original_match); - sfe_ipv4_insert_connection_match(si, c->reply_match); -} - -/* - * sfe_ipv4_remove_connection() - * Remove a sfe_ipv4_connection object from the hash. - */ -bool sfe_ipv4_remove_connection(struct sfe_ipv4 *si, struct sfe_ipv4_connection *c) -{ - lockdep_assert_held(&si->lock); - - if (c->removed) { - DEBUG_ERROR("%px: Connection has been removed already\n", c); - return false; - } - - /* - * dereference the decap direction top_interface_dev - */ - if (c->reply_match->top_interface_dev) { - dev_put(c->reply_match->top_interface_dev); - } - - /* - * Remove the connection match objects. - */ - sfe_ipv4_remove_connection_match(si, c->reply_match); - sfe_ipv4_remove_connection_match(si, c->original_match); - - /* - * Unlink the connection. - */ - if (c->prev) { - c->prev->next = c->next; - } else { - unsigned int conn_idx = sfe_ipv4_get_connection_hash(c->protocol, c->src_ip, c->src_port, - c->dest_ip, c->dest_port); - si->conn_hash[conn_idx] = c->next; - } - - if (c->next) { - c->next->prev = c->prev; - } - - /* - * Unlink connection from all_connections list - */ - if (c->all_connections_prev) { - c->all_connections_prev->all_connections_next = c->all_connections_next; - } else { - si->all_connections_head = c->all_connections_next; - } - - if (c->all_connections_next) { - c->all_connections_next->all_connections_prev = c->all_connections_prev; - } else { - si->all_connections_tail = c->all_connections_prev; - } - - /* - * If I am the next sync connection, move the sync to my next or head. 
- */ - if (unlikely(si->wc_next == c)) { - si->wc_next = c->all_connections_next; - } - - c->removed = true; - si->num_connections--; - return true; -} - -/* - * sfe_ipv4_gen_sync_connection() - * Sync a connection. - * - * On entry to this function we expect that the lock for the connection is either - * already held (when called from sfe_ipv4_periodic_sync()) or isn't required - * (when called from sfe_ipv4_flush_connection()). - */ -static void sfe_ipv4_gen_sync_connection(struct sfe_ipv4 *si, struct sfe_ipv4_connection *c, - struct sfe_connection_sync *sis, sfe_sync_reason_t reason, - u64 now_jiffies) -{ - struct sfe_ipv4_connection_match *original_cm; - struct sfe_ipv4_connection_match *reply_cm; - u32 packet_count, byte_count; - - /* - * Fill in the update message. - */ - sis->is_v6 = 0; - sis->protocol = c->protocol; - sis->src_ip.ip = c->src_ip; - sis->src_ip_xlate.ip = c->src_ip_xlate; - sis->dest_ip.ip = c->dest_ip; - sis->dest_ip_xlate.ip = c->dest_ip_xlate; - sis->src_port = c->src_port; - sis->src_port_xlate = c->src_port_xlate; - sis->dest_port = c->dest_port; - sis->dest_port_xlate = c->dest_port_xlate; - - original_cm = c->original_match; - reply_cm = c->reply_match; - sis->src_td_max_window = original_cm->protocol_state.tcp.max_win; - sis->src_td_end = original_cm->protocol_state.tcp.end; - sis->src_td_max_end = original_cm->protocol_state.tcp.max_end; - sis->dest_td_max_window = reply_cm->protocol_state.tcp.max_win; - sis->dest_td_end = reply_cm->protocol_state.tcp.end; - sis->dest_td_max_end = reply_cm->protocol_state.tcp.max_end; - - sfe_ipv4_connection_match_update_summary_stats(original_cm, &packet_count, &byte_count); - sis->src_new_packet_count = packet_count; - sis->src_new_byte_count = byte_count; - - sfe_ipv4_connection_match_update_summary_stats(reply_cm, &packet_count, &byte_count); - sis->dest_new_packet_count = packet_count; - sis->dest_new_byte_count = byte_count; - - sis->src_dev = original_cm->match_dev; - sis->src_packet_count = original_cm->rx_packet_count64; - sis->src_byte_count = original_cm->rx_byte_count64; - - sis->dest_dev = reply_cm->match_dev; - sis->dest_packet_count = reply_cm->rx_packet_count64; - sis->dest_byte_count = reply_cm->rx_byte_count64; - - sis->reason = reason; - - /* - * Get the time increment since our last sync. - */ - sis->delta_jiffies = now_jiffies - c->last_sync_jiffies; - c->last_sync_jiffies = now_jiffies; -} - -/* - * sfe_ipv4_free_connection_rcu() - * Called at an RCU quiescent state to free the connection object. - */ -static void sfe_ipv4_free_connection_rcu(struct rcu_head *head) -{ - struct sfe_ipv4_connection *c; - struct udp_sock *up; - struct sock *sk; - - /* - * We don't need the spin lock as the connection has already been removed from the linked list - */ - c = container_of(head, struct sfe_ipv4_connection, rcu); - - BUG_ON(!c->removed); - - DEBUG_TRACE("%px: connection has been deleted\n", c); - - /* - * Decrease the refcount taken in sfe_ipv4_create_rule() - * during the call to __udp4_lib_lookup() - */ - up = c->reply_match->up; - if (up) { - sk = (struct sock *)up; - sock_put(sk); - } - - /* - * Release our hold of the source and dest devices and free the memory - * for our connection objects. - */ - dev_put(c->original_dev); - dev_put(c->reply_dev); - kfree(c->original_match); - kfree(c->reply_match); - kfree(c); -} - -/* - * sfe_ipv4_sync_status() - * Update a connection's status to its connection manager.
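 *
 * [Editor's note] The callback invoked here is whatever the connection
 * manager registered via sfe_ipv4_register_sync_rule_callback(). A minimal,
 * hypothetical consumer sketch (example_sync_cb/example_register are not
 * part of the original file):
 */

/*
 * example_sync_cb()
 *	Illustrative sink for sync messages; a real connection manager would
 *	feed these deltas back into netfilter conntrack.
 */
static void example_sync_cb(struct sfe_connection_sync *sis)
{
	DEBUG_TRACE("sync reason %u: src %u / dest %u new packets\n",
		    sis->reason, sis->src_new_packet_count, sis->dest_new_packet_count);
}

static void example_register(void)
{
	sfe_ipv4_register_sync_rule_callback(example_sync_cb);
}

/*
 * (end of editor's note - original parameter notes follow)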
- * - * si: the ipv4 context - * c: the connection to be notified - * reason: the kind of notification: flush, stats or destroy - */ -void sfe_ipv4_sync_status(struct sfe_ipv4 *si, - struct sfe_ipv4_connection *c, - sfe_sync_reason_t reason) -{ - struct sfe_connection_sync sis; - u64 now_jiffies; - sfe_sync_rule_callback_t sync_rule_callback; - - rcu_read_lock(); - sync_rule_callback = rcu_dereference(si->sync_rule_callback); - rcu_read_unlock(); - if (!sync_rule_callback) { - return; - } - - /* - * Generate a sync message and then sync. - */ - now_jiffies = get_jiffies_64(); - sfe_ipv4_gen_sync_connection(si, c, &sis, reason, now_jiffies); - sync_rule_callback(&sis); -} - -/* - * sfe_ipv4_flush_connection() - * Flush a connection and free all associated resources. - * - * We need to be called with bottom halves disabled locally as we need to acquire - * the connection hash lock and release it again. In general we're actually called - * from within a BH and so we're fine, but we're also called when connections are - * torn down. - */ -void sfe_ipv4_flush_connection(struct sfe_ipv4 *si, - struct sfe_ipv4_connection *c, - sfe_sync_reason_t reason) -{ - BUG_ON(!c->removed); - - this_cpu_inc(si->stats_pcpu->connection_flushes64); - sfe_ipv4_sync_status(si, c, reason); - - /* - * Release our hold of the source and dest devices and free the memory - * for our connection objects. - */ - call_rcu(&c->rcu, sfe_ipv4_free_connection_rcu); -} - -/* - * sfe_ipv4_service_class_stats_inc() - * Increment per-CPU per-service-class stats. - */ -void sfe_ipv4_service_class_stats_inc(struct sfe_ipv4 *si, uint8_t sid, uint64_t bytes) -{ - struct sfe_ipv4_service_class_stats_db *sc_stats_db = this_cpu_ptr(si->stats_pcpu_psc); - struct sfe_ipv4_per_service_class_stats *sc_stats = &sc_stats_db->psc_stats[sid]; - - write_seqcount_begin(&sc_stats->seq); - sc_stats->tx_bytes += bytes; - sc_stats->tx_packets++; - write_seqcount_end(&sc_stats->seq); -} - -/* - * sfe_ipv4_exception_stats_inc() - * Increment exception stats. - */ -void sfe_ipv4_exception_stats_inc(struct sfe_ipv4 *si, enum sfe_ipv4_exception_events reason) -{ - struct sfe_ipv4_stats *stats = this_cpu_ptr(si->stats_pcpu); - stats->exception_events64[reason]++; - stats->packets_not_forwarded64++; -} - -/* - * sfe_ipv4_is_local_ip() - * Returns true if the IP is local; returns false otherwise. - */ -static bool sfe_ipv4_is_local_ip(struct sfe_ipv4 *si, __be32 ip_addr) -{ - struct net_device *dev; - - dev = ip_dev_find(&init_net, ip_addr); - if (dev) { - dev_put(dev); - return true; - } - - return false; -} - -/* - * sfe_ipv4_recv() - * Handle packet reception and forwarding. - * - * Returns 1 if the packet is forwarded or 0 if it isn't. - */ -int sfe_ipv4_recv(struct net_device *dev, struct sk_buff *skb, struct sfe_l2_info *l2_info, bool tun_outer) -{ - struct sfe_ipv4 *si = &__si; - unsigned int len; - unsigned int tot_len; - unsigned int frag_off; - unsigned int ihl; - bool sync_on_find; - bool ip_options; - struct iphdr *iph; - u32 protocol; - - /* - * Check that we have space for an IP header here. - */ - len = skb->len; - if (unlikely(!pskb_may_pull(skb, sizeof(struct iphdr)))) { - sfe_ipv4_exception_stats_inc(si, SFE_IPV4_EXCEPTION_EVENT_HEADER_INCOMPLETE); - DEBUG_TRACE("len: %u is too short\n", len); - return 0; - } - - /* - * Validate the IP checksum if necessary. If ip_summed is set to CHECKSUM_UNNECESSARY, it is assumed - * that the L3 checksum is validated by the Rx interface or the tunnel interface that has - * generated the packet.
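 *
 * [Editor's note] For example, an skb whose IPv4 checksum was already
 * verified by NIC hardware arrives with ip_summed == CHECKSUM_UNNECESSARY,
 * so the ip_fast_csum() call below is skipped for it.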
- */ - iph = (struct iphdr *)skb->data; - if (unlikely(skb->ip_summed != CHECKSUM_UNNECESSARY) && (ip_fast_csum((u8 *)iph, iph->ihl))) { - sfe_ipv4_exception_stats_inc(si, SFE_IPV4_EXCEPTION_EVENT_HEADER_CSUM_BAD); - - DEBUG_TRACE("Bad IPv4 header csum: 0x%x\n", iph->check); - return 0; - } - - /* - * Check that our "total length" is large enough for an IP header. - */ - tot_len = ntohs(iph->tot_len); - if (unlikely(tot_len < sizeof(struct iphdr))) { - - sfe_ipv4_exception_stats_inc(si, SFE_IPV4_EXCEPTION_EVENT_BAD_TOTAL_LENGTH); - DEBUG_TRACE("tot_len: %u is too short\n", tot_len); - return 0; - } - - /* - * Is our IP version wrong? - */ - if (unlikely(iph->version != 4)) { - sfe_ipv4_exception_stats_inc(si, SFE_IPV4_EXCEPTION_EVENT_NON_V4); - DEBUG_TRACE("IP version: %u\n", iph->version); - return 0; - } - - /* - * Does our datagram fit inside the skb? - */ - if (unlikely(tot_len > len)) { - DEBUG_TRACE("tot_len: %u, exceeds len: %u\n", tot_len, len); - sfe_ipv4_exception_stats_inc(si, SFE_IPV4_EXCEPTION_EVENT_DATAGRAM_INCOMPLETE); - return 0; - } - - /* - * Do we have a non-initial fragment? - */ - frag_off = ntohs(iph->frag_off); - if (unlikely(frag_off & IP_OFFSET)) { - sfe_ipv4_exception_stats_inc(si, SFE_IPV4_EXCEPTION_EVENT_NON_INITIAL_FRAGMENT); - DEBUG_TRACE("non-initial fragment\n"); - return 0; - } - - /* - * If we have a (first) fragment then mark it to cause any connection to flush. - */ - sync_on_find = unlikely(frag_off & IP_MF) ? true : false; - - /* - * Do we have any IP options? That's definite a slow path! If we do have IP - * options we need to recheck our header size. - */ - ihl = iph->ihl << 2; - ip_options = unlikely(ihl != sizeof(struct iphdr)) ? true : false; - if (unlikely(ip_options)) { - if (unlikely(len < ihl)) { - - DEBUG_TRACE("len: %u is too short for header of size: %u\n", len, ihl); - sfe_ipv4_exception_stats_inc(si, SFE_IPV4_EXCEPTION_EVENT_IP_OPTIONS_INCOMPLETE); - return 0; - } - - sync_on_find = true; - } - - /* - * Handle PPPoE bridge packets using 3-tuple acceleration if SFE_PPPOE_BR_ACCEL_MODE_EN_3T - */ - if (unlikely(sfe_l2_parse_flag_check(l2_info, SFE_L2_PARSE_FLAGS_PPPOE_INGRESS)) && - unlikely(sfe_pppoe_get_br_accel_mode() == SFE_PPPOE_BR_ACCEL_MODE_EN_3T)) { - struct ethhdr *eth = eth_hdr(skb); - if (!sfe_pppoe_mgr_find_session(l2_info->pppoe_session_id, eth->h_source)) { - return sfe_ipv4_recv_pppoe_bridge(si, skb, dev, len, iph, ihl, l2_info); - } - } - - protocol = iph->protocol; - if (IPPROTO_UDP == protocol) { - return sfe_ipv4_recv_udp(si, skb, dev, len, iph, ihl, sync_on_find, l2_info, tun_outer); - } - - if (IPPROTO_TCP == protocol) { - return sfe_ipv4_recv_tcp(si, skb, dev, len, iph, ihl, sync_on_find, l2_info); - } - - if (IPPROTO_ESP == protocol) { - return sfe_ipv4_recv_esp(si, skb, dev, len, iph, ihl, sync_on_find, tun_outer); - } - - if (IPPROTO_ICMP == protocol) { - return sfe_ipv4_recv_icmp(si, skb, dev, len, iph, ihl); - } - -#ifdef SFE_GRE_TUN_ENABLE - if (IPPROTO_GRE == protocol) { - return sfe_ipv4_recv_gre(si, skb, dev, len, iph, ihl, sync_on_find, l2_info, tun_outer); - } -#endif - if (IPPROTO_IPV6 == protocol) { - return sfe_ipv4_recv_tun6rd(si, skb, dev, len, iph, ihl, sync_on_find, l2_info, true); - } - - sfe_ipv4_exception_stats_inc(si, SFE_IPV4_EXCEPTION_EVENT_UNHANDLED_PROTOCOL); - - DEBUG_TRACE("not UDP, TCP or ICMP: %u\n", protocol); - return 0; -} - -static void -sfe_ipv4_update_tcp_state(struct sfe_ipv4_connection *c, - struct sfe_ipv4_rule_create_msg *msg) -{ - struct sfe_ipv4_connection_match *orig_cm; - 
struct sfe_ipv4_connection_match *repl_cm; - struct sfe_ipv4_tcp_connection_match *orig_tcp; - struct sfe_ipv4_tcp_connection_match *repl_tcp; - - orig_cm = c->original_match; - repl_cm = c->reply_match; - orig_tcp = &orig_cm->protocol_state.tcp; - repl_tcp = &repl_cm->protocol_state.tcp; - - /* update orig */ - if (orig_tcp->max_win < msg->tcp_rule.flow_max_window) { - orig_tcp->max_win = msg->tcp_rule.flow_max_window; - } - if ((s32)(orig_tcp->end - msg->tcp_rule.flow_end) < 0) { - orig_tcp->end = msg->tcp_rule.flow_end; - } - if ((s32)(orig_tcp->max_end - msg->tcp_rule.flow_max_end) < 0) { - orig_tcp->max_end = msg->tcp_rule.flow_max_end; - } - - /* update reply */ - if (repl_tcp->max_win < msg->tcp_rule.return_max_window) { - repl_tcp->max_win = msg->tcp_rule.return_max_window; - } - if ((s32)(repl_tcp->end - msg->tcp_rule.return_end) < 0) { - repl_tcp->end = msg->tcp_rule.return_end; - } - if ((s32)(repl_tcp->max_end - msg->tcp_rule.return_max_end) < 0) { - repl_tcp->max_end = msg->tcp_rule.return_max_end; - } - - /* update match flags */ - orig_cm->flags &= ~SFE_IPV4_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK; - repl_cm->flags &= ~SFE_IPV4_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK; - if (msg->rule_flags & SFE_RULE_CREATE_FLAG_NO_SEQ_CHECK) { - - orig_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK; - repl_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK; - } -} - -static void -sfe_ipv4_update_protocol_state(struct sfe_ipv4_connection *c, - struct sfe_ipv4_rule_create_msg *msg) -{ - switch (msg->tuple.protocol) { - case IPPROTO_TCP: - sfe_ipv4_update_tcp_state(c, msg); - break; - } -} - -/* - * sfe_ipv4_match_entry_set_vlan() - */ -static void sfe_ipv4_match_entry_set_vlan( - struct sfe_ipv4_connection_match *cm, - u32 primary_ingress_vlan_tag, - u32 primary_egress_vlan_tag, - u32 secondary_ingress_vlan_tag, - u32 secondary_egress_vlan_tag) -{ - u16 tpid; - /* - * Prevent stacking header counts when updating. 
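 *
 * [Editor's note] Each u32 tag argument packs the TPID into its upper 16 bits
 * and the TCI into its lower 16 bits, which is how it is unpacked below. For
 * example, 0x8100001e denotes an 802.1Q header (TPID 0x8100) with TCI 0x001e,
 * i.e. VID 30; a tag whose VID bits equal SFE_VLAN_ID_NOT_CONFIGURED means
 * "no tag at this level".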
- */ - cm->ingress_vlan_hdr_cnt = 0; - cm->egress_vlan_hdr_cnt = 0; - memset(cm->ingress_vlan_hdr, 0, sizeof(cm->ingress_vlan_hdr)); - memset(cm->egress_vlan_hdr, 0, sizeof(cm->egress_vlan_hdr)); - - /* - * vlan_hdr[0] corresponds to outer tag - * vlan_hdr[1] corresponds to inner tag - * Extract the vlan information (tpid and tci) from rule message - */ - if ((primary_ingress_vlan_tag & VLAN_VID_MASK) != SFE_VLAN_ID_NOT_CONFIGURED) { - tpid = (u16)(primary_ingress_vlan_tag >> 16); - cm->ingress_vlan_hdr[0].tpid = ntohs(tpid); - cm->ingress_vlan_hdr[0].tci = (u16)primary_ingress_vlan_tag; - cm->ingress_vlan_hdr_cnt++; - } - - if ((secondary_ingress_vlan_tag & VLAN_VID_MASK) != SFE_VLAN_ID_NOT_CONFIGURED) { - tpid = (u16)(secondary_ingress_vlan_tag >> 16); - cm->ingress_vlan_hdr[1].tpid = ntohs(tpid); - cm->ingress_vlan_hdr[1].tci = (u16)secondary_ingress_vlan_tag; - cm->ingress_vlan_hdr_cnt++; - } - - if ((primary_egress_vlan_tag & VLAN_VID_MASK) != SFE_VLAN_ID_NOT_CONFIGURED) { - tpid = (u16)(primary_egress_vlan_tag >> 16); - cm->egress_vlan_hdr[0].tpid = ntohs(tpid); - cm->egress_vlan_hdr[0].tci = (u16)primary_egress_vlan_tag; - cm->egress_vlan_hdr_cnt++; - } - - if ((secondary_egress_vlan_tag & VLAN_VID_MASK) != SFE_VLAN_ID_NOT_CONFIGURED) { - tpid = (u16)(secondary_egress_vlan_tag >> 16); - cm->egress_vlan_hdr[1].tpid = ntohs(tpid); - cm->egress_vlan_hdr[1].tci = (u16)secondary_egress_vlan_tag; - cm->egress_vlan_hdr_cnt++; - } -} - -void sfe_ipv4_update_rule(struct sfe_ipv4_rule_create_msg *msg) -{ - struct sfe_ipv4_connection *c; - struct sfe_ipv4 *si = &__si; - - spin_lock_bh(&si->lock); - - c = sfe_ipv4_find_connection(si, - msg->tuple.protocol, - msg->tuple.flow_ip, - msg->tuple.flow_ident, - msg->tuple.return_ip, - msg->tuple.return_ident); - if (c != NULL) { - sfe_ipv4_update_protocol_state(c, msg); - } - - spin_unlock_bh(&si->lock); -} - -/* - * sfe_ipv4_mark_rule_update() - * Updates the mark values of match entries. - */ -void sfe_ipv4_mark_rule_update(struct sfe_connection_mark *mark) -{ - struct sfe_ipv4_connection *c; - struct sfe_ipv4 *si = &__si; - - spin_lock_bh(&si->lock); - c = sfe_ipv4_find_connection(si, mark->protocol, - mark->src_ip[0], - mark->src_port, - mark->dest_ip[0], - mark->dest_port); - if (!c) { - spin_unlock_bh(&si->lock); - DEBUG_WARN("%px: connection not found for mark update\n", mark); - return; - } - c->original_match->mark = mark->mark; - c->reply_match->mark = mark->mark; - spin_unlock_bh(&si->lock); - DEBUG_TRACE("%px: connection mark updated with %d\n", mark, mark->mark); -} -EXPORT_SYMBOL(sfe_ipv4_mark_rule_update); - -/* - * sfe_ipv4_xmit_eth_type_check() - * Checking if MAC header has to be written. - */ -static inline bool sfe_ipv4_xmit_eth_type_check(struct net_device *dev, u32 cm_flags) -{ - if (!(dev->flags & IFF_NOARP)) { - return true; - } - - /* - * For PPPoE, since we are now supporting PPPoE encapsulation, we are writing L2 header. - */ - if (unlikely(cm_flags & SFE_IPV4_CONNECTION_MATCH_FLAG_PPPOE_ENCAP)) { - return true; - } - - return false; -} - -/* - * sfe_ipv4_service_class_stats_pcpu_get() - * Gets one CPU's service class statistics. 
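 *
 * [Editor's note] The seqcount retry loop below pairs with the
 * write_seqcount_begin()/write_seqcount_end() calls in
 * sfe_ipv4_service_class_stats_inc(). A hypothetical caller of the summing
 * wrapper follows; note the accumulators must start at zero because the
 * getters only ever add to them (the prototype mirrors the definition
 * further below):
 */

bool sfe_ipv4_service_class_stats_get(uint8_t sid, uint64_t *bytes, uint64_t *packets);

/*
 * example_print_sc_totals()
 *	Illustrative only: report one service class's totals.
 */
static void example_print_sc_totals(uint8_t sid)
{
	uint64_t bytes = 0;
	uint64_t packets = 0;

	if (sfe_ipv4_service_class_stats_get(sid, &bytes, &packets)) {
		DEBUG_INFO("sid %u: %llu bytes in %llu packets\n",
			   sid, (unsigned long long)bytes, (unsigned long long)packets);
	}
}

/*
 * (end of editor's note - the per-CPU reader follows)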
- */ -static inline bool sfe_ipv4_service_class_stats_pcpu_get(struct sfe_ipv4_per_service_class_stats *sc_stats, uint64_t *bytes, uint64_t *packets) -{ - uint32_t retries = 0; - uint32_t seq; - uint64_t bytes_tmp, packets_tmp; - - do { - seq = read_seqcount_begin(&sc_stats->seq); - bytes_tmp = sc_stats->tx_bytes; - packets_tmp = sc_stats->tx_packets; - } while (read_seqcount_retry(&sc_stats->seq, seq) && ++retries < SFE_SERVICE_CLASS_STATS_MAX_RETRY); - - *bytes += bytes_tmp; - *packets += packets_tmp; - - return retries < SFE_SERVICE_CLASS_STATS_MAX_RETRY; -} - -/* - * sfe_ipv4_service_class_stats_get() - * Copy the ipv4 statistics for the given service class. - */ -bool sfe_ipv4_service_class_stats_get(uint8_t sid, uint64_t *bytes, uint64_t *packets) -{ - struct sfe_ipv4 *si = &__si; - uint32_t cpu = 0; - - for_each_possible_cpu(cpu) { - struct sfe_ipv4_service_class_stats_db *stats_db = per_cpu_ptr(si->stats_pcpu_psc, cpu); - struct sfe_ipv4_per_service_class_stats *sc_stats = &stats_db->psc_stats[sid]; - - if (!sfe_ipv4_service_class_stats_pcpu_get(sc_stats, bytes, packets)) { - return false; - } - } - - return true; -} - -/* - * sfe_ipv4_create_rule() - * Create a forwarding rule. - */ -int sfe_ipv4_create_rule(struct sfe_ipv4_rule_create_msg *msg) -{ - struct sfe_ipv4 *si = &__si; - struct sfe_ipv4_connection *c, *c_old; - struct sfe_ipv4_connection_match *original_cm; - struct sfe_ipv4_connection_match *reply_cm; - struct net_device *dest_dev; - struct net_device *src_dev; - struct sfe_ipv4_5tuple *tuple = &msg->tuple; - s32 flow_interface_num = msg->conn_rule.flow_top_interface_num; - s32 return_interface_num = msg->conn_rule.return_top_interface_num; - struct net *net; - struct sock *sk; - unsigned int src_if_idx; - u32 flow_sawf_tag; - u32 return_sawf_tag; - - if (msg->rule_flags & SFE_RULE_CREATE_FLAG_USE_FLOW_BOTTOM_INTERFACE) { - flow_interface_num = msg->conn_rule.flow_interface_num; - } - - if (msg->rule_flags & SFE_RULE_CREATE_FLAG_USE_RETURN_BOTTOM_INTERFACE) { - return_interface_num = msg->conn_rule.return_interface_num; - } - - src_dev = dev_get_by_index(&init_net, flow_interface_num); - if (!src_dev) { - DEBUG_WARN("%px: Unable to find src_dev corresponding to %d\n", msg, - flow_interface_num); - this_cpu_inc(si->stats_pcpu->connection_create_failures64); - return -EINVAL; - } - - dest_dev = dev_get_by_index(&init_net, return_interface_num); - if (!dest_dev) { - DEBUG_WARN("%px: Unable to find dest_dev corresponding to %d\n", msg, - return_interface_num); - this_cpu_inc(si->stats_pcpu->connection_create_failures64); - dev_put(src_dev); - return -EINVAL; - } - - if (unlikely((dest_dev->reg_state != NETREG_REGISTERED) || - (src_dev->reg_state != NETREG_REGISTERED))) { - dev_put(src_dev); - dev_put(dest_dev); - DEBUG_WARN("%px: src_dev=%s and dest_dev=%s are unregistered\n", msg, - src_dev->name, dest_dev->name); - this_cpu_inc(si->stats_pcpu->connection_create_failures64); - return -EINVAL; - } - - /* - * Allocate the various connection tracking objects. 
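 *
 * [Editor's note] GFP_ATOMIC is used for these allocations because rule
 * creation may run in contexts that cannot sleep; note how every failure
 * path below unwinds the earlier allocations and both dev_get_by_index()
 * references before returning.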
- */ - c = (struct sfe_ipv4_connection *)kzalloc(sizeof(struct sfe_ipv4_connection), GFP_ATOMIC); - if (unlikely(!c)) { - DEBUG_WARN("%px: memory allocation of connection entry failed\n", msg); - this_cpu_inc(si->stats_pcpu->connection_create_failures64); - dev_put(src_dev); - dev_put(dest_dev); - return -ENOMEM; - } - - original_cm = (struct sfe_ipv4_connection_match *)kzalloc(sizeof(struct sfe_ipv4_connection_match), GFP_ATOMIC); - if (unlikely(!original_cm)) { - DEBUG_WARN("%px: memory allocation of connection match entry failed\n", msg); - this_cpu_inc(si->stats_pcpu->connection_create_failures64); - kfree(c); - dev_put(src_dev); - dev_put(dest_dev); - return -ENOMEM; - } - - reply_cm = (struct sfe_ipv4_connection_match *)kzalloc(sizeof(struct sfe_ipv4_connection_match), GFP_ATOMIC); - if (unlikely(!reply_cm)) { - DEBUG_WARN("%px: memory allocation of connection match entry failed\n", msg); - this_cpu_inc(si->stats_pcpu->connection_create_failures64); - kfree(original_cm); - kfree(c); - dev_put(src_dev); - dev_put(dest_dev); - return -ENOMEM; - } - - this_cpu_inc(si->stats_pcpu->connection_create_requests64); - - spin_lock_bh(&si->lock); - - /* - * Check to see if there is already a flow that matches the rule we're - * trying to create. If there is then we can't create a new one. - */ - c_old = sfe_ipv4_find_connection(si, - msg->tuple.protocol, - msg->tuple.flow_ip, - msg->tuple.flow_ident, - msg->tuple.return_ip, - msg->tuple.return_ident); - - if (c_old != NULL) { - this_cpu_inc(si->stats_pcpu->connection_create_collisions64); - - /* - * If we already have the flow then it's likely that this - * request to create the connection rule contains more - * up-to-date information. Check and update accordingly. - */ - sfe_ipv4_update_protocol_state(c, msg); - spin_unlock_bh(&si->lock); - - kfree(reply_cm); - kfree(original_cm); - kfree(c); - - dev_put(src_dev); - dev_put(dest_dev); - - DEBUG_TRACE("%px: connection already exists - p:%d\n" - " s: %s:%pM:%pI4:%u, d: %s:%pM:%pI4:%u\n", - msg, tuple->protocol, - src_dev->name, msg->conn_rule.flow_mac, &tuple->flow_ip, ntohs(tuple->flow_ident), - dest_dev->name, msg->conn_rule.return_mac, &tuple->return_ip, ntohs(tuple->return_ident)); - - return -EADDRINUSE; - } - - /* - * Fill in the "original" direction connection matching object. - * Note that the transmit MAC address is "dest_mac_xlate" because - * we always know both ends of a connection by their translated - * addresses and not their public addresses. - */ - original_cm->match_dev = src_dev; - original_cm->match_protocol = tuple->protocol; - original_cm->match_src_ip = tuple->flow_ip; - original_cm->match_src_port = netif_is_vxlan(src_dev) ? 0 : tuple->flow_ident; - original_cm->match_dest_ip = tuple->return_ip; - original_cm->match_dest_port = tuple->return_ident; - - original_cm->xlate_src_ip = msg->conn_rule.flow_ip_xlate; - original_cm->xlate_src_port = msg->conn_rule.flow_ident_xlate; - original_cm->xlate_dest_ip = msg->conn_rule.return_ip_xlate; - original_cm->xlate_dest_port = msg->conn_rule.return_ident_xlate; - - if (tuple->protocol == IPPROTO_GRE) { - /* - * the PPTP is 4 tuple lookup. - * During th rule lookup destination call id from packet - * is matched against destination port in cm. 
- */ - original_cm->match_src_port = 0; - original_cm->xlate_src_port = 0; - } - - original_cm->xmit_dev = dest_dev; - original_cm->xmit_dev_mtu = msg->conn_rule.return_mtu; - - original_cm->connection = c; - original_cm->counter_match = reply_cm; - - /* - * UDP Socket is valid only in decap direction. - */ - RCU_INIT_POINTER(original_cm->up, NULL); - - if (msg->valid_flags & SFE_RULE_CREATE_MARK_VALID) { - original_cm->mark = msg->mark_rule.flow_mark; - original_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_MARK; - } - if (msg->valid_flags & SFE_RULE_CREATE_QOS_VALID) { - original_cm->priority = msg->qos_rule.flow_qos_tag; - original_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_PRIORITY_REMARK; - } - if (msg->valid_flags & SFE_RULE_CREATE_DSCP_MARKING_VALID) { - original_cm->dscp = msg->dscp_rule.flow_dscp << SFE_IPV4_DSCP_SHIFT; - original_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_DSCP_REMARK; - } - if (msg->rule_flags & SFE_RULE_CREATE_FLAG_BRIDGE_FLOW) { - original_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_BRIDGE_FLOW; - } - if (msg->rule_flags & SFE_RULE_CREATE_FLAG_FLOW_TRANSMIT_FAST) { - original_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_FAST_XMIT_DEV_ADMISSION; - } - - /* - * Mark SAWF metadata if the sawf tag is valid and set. - */ - original_cm->sawf_valid = false; - flow_sawf_tag = SFE_GET_SAWF_TAG(msg->sawf_rule.flow_mark); - if (likely(SFE_SAWF_TAG_IS_VALID(flow_sawf_tag))) { - original_cm->mark = msg->sawf_rule.flow_mark; - original_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_MARK; - original_cm->sawf_valid = true; - } - - /* - * Add VLAN rule to original_cm - */ - if (msg->valid_flags & SFE_RULE_CREATE_VLAN_VALID) { - struct sfe_vlan_rule *vlan_primary_rule = &msg->vlan_primary_rule; - struct sfe_vlan_rule *vlan_secondary_rule = &msg->vlan_secondary_rule; - sfe_ipv4_match_entry_set_vlan(original_cm, - vlan_primary_rule->ingress_vlan_tag, - vlan_primary_rule->egress_vlan_tag, - vlan_secondary_rule->ingress_vlan_tag, - vlan_secondary_rule->egress_vlan_tag); - - if ((msg->rule_flags & SFE_RULE_CREATE_FLAG_USE_RETURN_BOTTOM_INTERFACE) && - original_cm->egress_vlan_hdr_cnt > 0) { - original_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_INSERT_EGRESS_VLAN_TAG; - original_cm->l2_hdr_size += original_cm->egress_vlan_hdr_cnt * VLAN_HLEN; - } - } - - if (((IPPROTO_GRE == tuple->protocol) || (IPPROTO_ESP == tuple->protocol)) && - !sfe_ipv4_is_local_ip(si, original_cm->match_dest_ip)) { - original_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_PASSTHROUGH; - } - -#ifdef CONFIG_NF_FLOW_COOKIE - original_cm->flow_cookie = 0; -#endif -#ifdef CONFIG_XFRM - if (msg->valid_flags & SFE_RULE_CREATE_DIRECTION_VALID) { - original_cm->flow_accel = msg->direction_rule.flow_accel; - } else { - original_cm->flow_accel = 1; - } -#endif - - /* - * If l2_features are disabled and flow uses l2 features such as macvlan/bridge/pppoe/vlan, - * bottom interfaces are expected to be disabled in the flow rule and always top interfaces - * are used. In such cases, do not use HW csum offload. csum offload is used only when we - * are sending directly to the destination interface that supports it. 
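 *
 * [Editor's note] As written, the branch below only documents intent: the
 * assignment of SFE_IPV4_CONNECTION_MATCH_FLAG_CSUM_OFFLOAD is compiled out
 * with #if 0, so checksum offload is never actually enabled here (the same
 * pattern recurs for the reply direction).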
- */ - if (likely(dest_dev->features & NETIF_F_HW_CSUM) && sfe_dev_has_hw_csum(dest_dev)) { - if ((msg->conn_rule.return_top_interface_num == msg->conn_rule.return_interface_num) || - (msg->rule_flags & SFE_RULE_CREATE_FLAG_USE_RETURN_BOTTOM_INTERFACE)) { - - /* - * Dont enable CSUM offload - */ -#if 0 - original_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_CSUM_OFFLOAD; -#endif - } - } - - if (msg->rule_flags & SFE_RULE_CREATE_FLAG_FLOW_SRC_INTERFACE_CHECK) { - original_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_SRC_INTERFACE_CHECK; - } - - if (msg->rule_flags & SFE_RULE_CREATE_FLAG_FLOW_SRC_INTERFACE_CHECK_NO_FLUSH) { - original_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_SRC_INTERFACE_CHECK_NO_FLUSH; - } - - /* - * Adding PPPoE parameters to original and reply entries based on the direction where - * PPPoE header is valid in ECM rule. - * - * If PPPoE is valid in flow direction (from interface is PPPoE), then - * original cm will have PPPoE at ingress (strip PPPoE header) - * reply cm will have PPPoE at egress (add PPPoE header) - * - * If PPPoE is valid in return direction (to interface is PPPoE), then - * original cm will have PPPoE at egress (add PPPoE header) - * reply cm will have PPPoE at ingress (strip PPPoE header) - */ - if (msg->valid_flags & SFE_RULE_CREATE_PPPOE_DECAP_VALID) { - original_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_PPPOE_DECAP; - original_cm->pppoe_session_id = msg->pppoe_rule.flow_pppoe_session_id; - ether_addr_copy(original_cm->pppoe_remote_mac, msg->pppoe_rule.flow_pppoe_remote_mac); - - reply_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_PPPOE_ENCAP; - reply_cm->l2_hdr_size += PPPOE_SES_HLEN; - reply_cm->pppoe_session_id = msg->pppoe_rule.flow_pppoe_session_id; - ether_addr_copy(reply_cm->pppoe_remote_mac, msg->pppoe_rule.flow_pppoe_remote_mac); - } - - if (msg->valid_flags & SFE_RULE_CREATE_PPPOE_ENCAP_VALID) { - original_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_PPPOE_ENCAP; - original_cm->l2_hdr_size += PPPOE_SES_HLEN; - original_cm->pppoe_session_id = msg->pppoe_rule.return_pppoe_session_id; - ether_addr_copy(original_cm->pppoe_remote_mac, msg->pppoe_rule.return_pppoe_remote_mac); - - reply_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_PPPOE_DECAP; - reply_cm->pppoe_session_id = msg->pppoe_rule.return_pppoe_session_id; - ether_addr_copy(reply_cm->pppoe_remote_mac, msg->pppoe_rule.return_pppoe_remote_mac); - } - - if (msg->rule_flags & SFE_RULE_CREATE_FLAG_RETURN_SRC_INTERFACE_CHECK) { - reply_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_SRC_INTERFACE_CHECK; - } - - if (msg->rule_flags & SFE_RULE_CREATE_FLAG_RETURN_SRC_INTERFACE_CHECK_NO_FLUSH) { - reply_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_SRC_INTERFACE_CHECK_NO_FLUSH; - } - - /* - * For the non-arp interface, we don't write L2 HDR. - */ - if (sfe_ipv4_xmit_eth_type_check(dest_dev, original_cm->flags)) { - - /* - * Check whether the rule has configured a specific source MAC address to use. 
- * This is needed when virtual L3 interfaces such as br-lan, macvlan, vlan are used during egress - */ - - if (msg->rule_flags & SFE_RULE_CREATE_FLAG_BRIDGE_FLOW) { - ether_addr_copy((u8 *)original_cm->xmit_src_mac, (u8 *)msg->conn_rule.flow_mac); - } else { - if ((msg->valid_flags & SFE_RULE_CREATE_SRC_MAC_VALID) && - (msg->src_mac_rule.mac_valid_flags & SFE_SRC_MAC_RETURN_VALID)) { - ether_addr_copy((u8 *)original_cm->xmit_src_mac, (u8 *)msg->src_mac_rule.return_src_mac); - } else { - ether_addr_copy((u8 *)original_cm->xmit_src_mac, (u8 *)dest_dev->dev_addr); - } - } - - ether_addr_copy((u8 *)original_cm->xmit_dest_mac, (u8 *)msg->conn_rule.return_mac); - - original_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_WRITE_L2_HDR; - original_cm->l2_hdr_size += ETH_HLEN; - - /* - * If our dev writes Ethernet headers then we can write a really fast - * version. - */ - if (dest_dev->header_ops) { - if (dest_dev->header_ops->create == eth_header) { - original_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_WRITE_FAST_ETH_HDR; - } - } - } - - /* - * Fill in the "reply" direction connection matching object. - */ - reply_cm->match_dev = dest_dev; - reply_cm->match_protocol = tuple->protocol; - reply_cm->match_src_ip = msg->conn_rule.return_ip_xlate; - - /* - * Keep source port as 0 for VxLAN tunnels. - */ - if (netif_is_vxlan(src_dev) || netif_is_vxlan(dest_dev)) { - reply_cm->match_src_port = 0; - } else { - reply_cm->match_src_port = msg->conn_rule.return_ident_xlate; - } - - reply_cm->match_dest_ip = msg->conn_rule.flow_ip_xlate; - reply_cm->match_dest_port = msg->conn_rule.flow_ident_xlate; - - reply_cm->xlate_src_ip = tuple->return_ip; - reply_cm->xlate_src_port = tuple->return_ident; - reply_cm->xlate_dest_ip = tuple->flow_ip; - reply_cm->xlate_dest_port = tuple->flow_ident; - - if (tuple->protocol == IPPROTO_GRE) { - /* - * the PPTP is 4 tuple lookup. - * During th rule lookup destination call id from packet - * is matched against destination port in cm. - */ - reply_cm->match_src_port = 0; - reply_cm->xlate_src_port = 0; - } - - reply_cm->xmit_dev = src_dev; - reply_cm->xmit_dev_mtu = msg->conn_rule.flow_mtu; - - reply_cm->connection = c; - reply_cm->counter_match = original_cm; - - if (msg->valid_flags & SFE_RULE_CREATE_MARK_VALID) { - reply_cm->mark = msg->mark_rule.return_mark; - reply_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_MARK; - } - if (msg->valid_flags & SFE_RULE_CREATE_QOS_VALID) { - reply_cm->priority = msg->qos_rule.return_qos_tag; - reply_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_PRIORITY_REMARK; - } - - if (msg->valid_flags & SFE_RULE_CREATE_DSCP_MARKING_VALID) { - reply_cm->dscp = msg->dscp_rule.return_dscp << SFE_IPV4_DSCP_SHIFT; - reply_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_DSCP_REMARK; - } - if (msg->rule_flags & SFE_RULE_CREATE_FLAG_BRIDGE_FLOW) { - reply_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_BRIDGE_FLOW; - } - if (msg->rule_flags & SFE_RULE_CREATE_FLAG_RETURN_TRANSMIT_FAST) { - reply_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_FAST_XMIT_DEV_ADMISSION; - } - - if (((IPPROTO_GRE == tuple->protocol) || (IPPROTO_ESP == tuple->protocol)) && - !sfe_ipv4_is_local_ip(si, reply_cm->match_dest_ip)) { - reply_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_PASSTHROUGH; - } - - /* - * Mark SAWF metadata in reply match if the sawf tag is valid. 
- */ - reply_cm->sawf_valid = false; - return_sawf_tag = SFE_GET_SAWF_TAG(msg->sawf_rule.return_mark); - if (likely(SFE_SAWF_TAG_IS_VALID(return_sawf_tag))) { - reply_cm->mark = msg->sawf_rule.return_mark; - reply_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_MARK; - reply_cm->sawf_valid = true; - } - - /* - * Setup UDP Socket if found to be valid for decap. - */ - RCU_INIT_POINTER(reply_cm->up, NULL); - net = dev_net(reply_cm->match_dev); - src_if_idx = src_dev->ifindex; - - rcu_read_lock(); - - /* - * Look for the associated sock object. - * __udp4_lib_lookup() holds a reference for this sock object, - * which will be released in sfe_ipv4_free_connection_rcu() - */ -#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 13, 0)) - sk = __udp4_lib_lookup(net, reply_cm->xlate_src_ip, reply_cm->xlate_src_port, - reply_cm->match_dest_ip, reply_cm->match_dest_port, src_if_idx, &udp_table); -#else - sk = __udp4_lib_lookup(net, reply_cm->xlate_src_ip, reply_cm->xlate_src_port, - reply_cm->match_dest_ip, reply_cm->match_dest_port, src_if_idx, 0, &udp_table, NULL); -#endif - - rcu_read_unlock(); - - /* - * We set the UDP sock pointer as valid only for decap direction. - */ - if (sk && udp_sk(sk)->encap_type) { -#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 13, 0)) - if (!atomic_add_unless(&sk->sk_refcnt, 1, 0)) { -#else - if (!refcount_inc_not_zero(&sk->sk_refcnt)) { -#endif - this_cpu_inc(si->stats_pcpu->connection_create_failures64); - spin_unlock_bh(&si->lock); - kfree(reply_cm); - kfree(original_cm); - kfree(c); - - DEBUG_TRACE("%px: sfe: unable to take reference for socket(%px) p:%d\n" - " s: %s:%pM:%pI4:%u, d: %s:%pM:%pI4:%u\n", - msg, sk, tuple->protocol, - src_dev->name, msg->conn_rule.flow_mac, &tuple->flow_ip, ntohs(tuple->flow_ident), - dest_dev->name, msg->conn_rule.return_mac, &tuple->return_ip, ntohs(tuple->return_ident)); - - dev_put(src_dev); - dev_put(dest_dev); - - return -ESHUTDOWN; - } - - rcu_assign_pointer(reply_cm->up, udp_sk(sk)); - - DEBUG_INFO("%px: Sock(%px) lookup success with reply_cm direction\n", msg, sk); - DEBUG_INFO("%px: SFE connection -\n" - " s: %s:%pI4(%pI4):%u(%u)\n" - " d: %s:%pI4(%pI4):%u(%u)\n", - msg, reply_cm->match_dev->name, &reply_cm->match_src_ip, &reply_cm->xlate_src_ip, - ntohs(reply_cm->match_src_port), ntohs(reply_cm->xlate_src_port), - reply_cm->xmit_dev->name, &reply_cm->match_dest_ip, &reply_cm->xlate_dest_ip, - ntohs(reply_cm->match_dest_port), ntohs(reply_cm->xlate_dest_port)); - } - - /* - * Add VLAN rule to reply_cm - */ - if (msg->valid_flags & SFE_RULE_CREATE_VLAN_VALID) { - struct sfe_vlan_rule *vlan_primary_rule = &msg->vlan_primary_rule; - struct sfe_vlan_rule *vlan_secondary_rule = &msg->vlan_secondary_rule; - sfe_ipv4_match_entry_set_vlan(reply_cm, - vlan_primary_rule->egress_vlan_tag, - vlan_primary_rule->ingress_vlan_tag, - vlan_secondary_rule->egress_vlan_tag, - vlan_secondary_rule->ingress_vlan_tag); - - if ((msg->rule_flags & SFE_RULE_CREATE_FLAG_USE_FLOW_BOTTOM_INTERFACE) && - reply_cm->egress_vlan_hdr_cnt > 0) { - reply_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_INSERT_EGRESS_VLAN_TAG; - reply_cm->l2_hdr_size += reply_cm->egress_vlan_hdr_cnt * VLAN_HLEN; - } - } - - /* - * the net_protocol handler will be used only in decap path - * for non passthrough case. 
- */ - original_cm->proto = NULL; - reply_cm->proto = NULL; - original_cm->top_interface_dev = NULL; - reply_cm->top_interface_dev = NULL; - -#ifdef SFE_GRE_TUN_ENABLE - if ((IPPROTO_GRE == tuple->protocol) && !(reply_cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_PASSTHROUGH)) { - rcu_read_lock(); - reply_cm->proto = rcu_dereference(inet_protos[IPPROTO_GRE]); - rcu_read_unlock(); - - if (unlikely(!reply_cm->proto)) { - this_cpu_inc(si->stats_pcpu->connection_create_failures64); - spin_unlock_bh(&si->lock); - kfree(reply_cm); - kfree(original_cm); - kfree(c); - dev_put(src_dev); - dev_put(dest_dev); - DEBUG_WARN("sfe: GRE proto handler is not registered\n"); - return -EPERM; - } - } -#endif - - if (IPPROTO_IPV6 == tuple->protocol) { - original_cm->proto = NULL; - rcu_read_lock(); - reply_cm->proto = rcu_dereference(inet_protos[IPPROTO_IPV6]); - rcu_read_unlock(); - reply_cm->top_interface_dev = dev_get_by_index(&init_net, msg->conn_rule.return_top_interface_num); - - if (unlikely(!reply_cm->top_interface_dev)) { - DEBUG_WARN("%px: Unable to find top_interface_dev corresponding to %d\n", msg, - msg->conn_rule.return_top_interface_num); - this_cpu_inc(si->stats_pcpu->connection_create_failures64); - spin_unlock_bh(&si->lock); - kfree(reply_cm); - kfree(original_cm); - kfree(c); - dev_put(src_dev); - dev_put(dest_dev); - return -EINVAL; - } - } - - if ((IPPROTO_ESP == tuple->protocol) && !(reply_cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_PASSTHROUGH)) { - rcu_read_lock(); - reply_cm->proto = rcu_dereference(inet_protos[IPPROTO_ESP]); - rcu_read_unlock(); - - if (unlikely(!reply_cm->proto)) { - kfree(reply_cm); - kfree(original_cm); - kfree(c); - dev_put(src_dev); - dev_put(dest_dev); - DEBUG_WARN("sfe: ESP proto handler is not registered\n"); - return -EPERM; - } - } - -#ifdef CONFIG_NF_FLOW_COOKIE - reply_cm->flow_cookie = 0; -#endif -#ifdef CONFIG_XFRM - if (msg->valid_flags & SFE_RULE_CREATE_DIRECTION_VALID) { - reply_cm->flow_accel = msg->direction_rule.return_accel; - } else { - reply_cm->flow_accel = 1; - } - -#endif - /* - * If l2_features are disabled and flow uses l2 features such as macvlan/bridge/pppoe/vlan, - * bottom interfaces are expected to be disabled in the flow rule and always top interfaces - * are used. In such cases, do not use HW csum offload. csum offload is used only when we - * are sending directly to the destination interface that supports it. - */ - if (likely(src_dev->features & NETIF_F_HW_CSUM) && sfe_dev_has_hw_csum(src_dev)) { - if ((msg->conn_rule.flow_top_interface_num == msg->conn_rule.flow_interface_num) || - (msg->rule_flags & SFE_RULE_CREATE_FLAG_USE_FLOW_BOTTOM_INTERFACE)) { - /* - * Dont enable CSUM offload - */ -#if 0 - reply_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_CSUM_OFFLOAD; -#endif - } - } - - /* - * For the non-arp interface, we don't write L2 HDR. - */ - if (sfe_ipv4_xmit_eth_type_check(src_dev, reply_cm->flags)) { - - /* - * Check whether the rule has configured a specific source MAC address to use. 
- * This is needed when virtual L3 interfaces such as br-lan, macvlan, vlan are used during egress - */ - - if (msg->rule_flags & SFE_RULE_CREATE_FLAG_BRIDGE_FLOW) { - ether_addr_copy((u8 *)reply_cm->xmit_src_mac, (u8 *)msg->conn_rule.return_mac); - } else { - if ((msg->valid_flags & SFE_RULE_CREATE_SRC_MAC_VALID) && - (msg->src_mac_rule.mac_valid_flags & SFE_SRC_MAC_FLOW_VALID)) { - ether_addr_copy((u8 *)reply_cm->xmit_src_mac, (u8 *)msg->src_mac_rule.flow_src_mac); - } else { - ether_addr_copy((u8 *)reply_cm->xmit_src_mac, (u8 *)src_dev->dev_addr); - } - } - - ether_addr_copy((u8 *)reply_cm->xmit_dest_mac, (u8 *)msg->conn_rule.flow_mac); - - reply_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_WRITE_L2_HDR; - reply_cm->l2_hdr_size += ETH_HLEN; - - /* - * If our dev writes Ethernet headers then we can write a really fast - * version. - */ - if (src_dev->header_ops) { - if (src_dev->header_ops->create == eth_header) { - reply_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_WRITE_FAST_ETH_HDR; - } - } - } - - if ((tuple->return_ip != msg->conn_rule.return_ip_xlate) || - (tuple->return_ident != msg->conn_rule.return_ident_xlate)) { - original_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_DEST; - reply_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_SRC; - } - - if ((tuple->flow_ip != msg->conn_rule.flow_ip_xlate) || - (tuple->flow_ident != msg->conn_rule.flow_ident_xlate)) { - original_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_SRC; - reply_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_DEST; - } - - /* - * Initialize the protocol-specific information that we track. - */ - switch (tuple->protocol) { - case IPPROTO_TCP: - original_cm->protocol_state.tcp.win_scale = msg->tcp_rule.flow_window_scale; - original_cm->protocol_state.tcp.max_win = msg->tcp_rule.flow_max_window ? msg->tcp_rule.flow_max_window : 1; - original_cm->protocol_state.tcp.end = msg->tcp_rule.flow_end; - original_cm->protocol_state.tcp.max_end = msg->tcp_rule.flow_max_end; - - reply_cm->protocol_state.tcp.win_scale = msg->tcp_rule.return_window_scale; - reply_cm->protocol_state.tcp.max_win = msg->tcp_rule.return_max_window ? msg->tcp_rule.return_max_window : 1; - reply_cm->protocol_state.tcp.end = msg->tcp_rule.return_end; - reply_cm->protocol_state.tcp.max_end = msg->tcp_rule.return_max_end; - - if (msg->rule_flags & SFE_RULE_CREATE_FLAG_NO_SEQ_CHECK) { - original_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK; - reply_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK; - } - break; - - case IPPROTO_RAW: - /* - * Set src_port to 0 to avoid hash collision in connection match lookups. - */ - original_cm->match_src_port = 0; - original_cm->xlate_src_port = 0; - reply_cm->match_src_port = 0; - reply_cm->xlate_src_port = 0; - break; - } - - /* - * Fill in the ipv4_connection object. 
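- *
- * (Hedged orientation note: the assignments below pair the two match
- *  entries with their connection; c->original_match covers the flow
- *  direction, c->reply_match the return direction, and each entry's
- *  counter_match field, per the header file, points at its opposite.)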
- */ - c->protocol = tuple->protocol; - c->src_ip = tuple->flow_ip; - c->src_ip_xlate = msg->conn_rule.flow_ip_xlate; - c->src_port = tuple->flow_ident; - c->src_port_xlate = msg->conn_rule.flow_ident_xlate; - c->original_dev = src_dev; - c->original_match = original_cm; - c->dest_ip = tuple->return_ip; - c->dest_ip_xlate = msg->conn_rule.return_ip_xlate; - c->dest_port = tuple->return_ident; - c->dest_port_xlate = msg->conn_rule.return_ident_xlate; - c->reply_dev = dest_dev; - c->reply_match = reply_cm; - c->debug_read_seq = 0; - c->last_sync_jiffies = get_jiffies_64(); - c->removed = false; - - sfe_ipv4_connection_match_compute_translations(original_cm); - sfe_ipv4_connection_match_compute_translations(reply_cm); - sfe_ipv4_insert_connection(si, c); - - spin_unlock_bh(&si->lock); - - /* - * We have everything we need! - */ - DEBUG_INFO("%px: NEW connection - p: %d\n" - "original_cm: match_dev=src_dev: %s %d %pM\n" - " xmit_dev=dest_dev: %s %d %pM\n" - " xmit_src_mac: %pM\n" - " xmit_dest_mac: %pM\n" - " flags: %x l2_hdr: %u\n" - "flow_ip: %pI4:%u\n" - "flow_ip_xlate: %pI4:%u\n" - "flow_mac: %pM\n" - "reply_cm: match_dev=dest_dev: %s %d %pM\n" - " xmit_dev=src_dev: %s %d %pM\n" - " xmit_src_mac: %pM\n" - " xmit_dest_mac: %pM\n" - " flags: %x l2_hdr: %u\n" - "return_ip: %pI4:%u\n" - "return_ip_xlate: %pI4:%u\n" - "return_mac: %pM\n" - "flags: valid=%x src_mac_valid=%x\n", - c, tuple->protocol, - original_cm->match_dev->name, original_cm->match_dev->ifindex, original_cm->match_dev->dev_addr, - original_cm->xmit_dev->name, original_cm->xmit_dev->ifindex, original_cm->xmit_dev->dev_addr, - original_cm->xmit_src_mac, original_cm->xmit_dest_mac, original_cm->flags, original_cm->l2_hdr_size, - &tuple->flow_ip, ntohs(tuple->flow_ident), - &msg->conn_rule.flow_ip_xlate, ntohs(msg->conn_rule.flow_ident_xlate), - msg->conn_rule.flow_mac, - reply_cm->match_dev->name, reply_cm->match_dev->ifindex, reply_cm->match_dev->dev_addr, - reply_cm->xmit_dev->name, reply_cm->xmit_dev->ifindex, reply_cm->xmit_dev->dev_addr, - reply_cm->xmit_src_mac, reply_cm->xmit_dest_mac, reply_cm->flags, reply_cm->l2_hdr_size, - &tuple->return_ip, ntohs(tuple->return_ident), - &msg->conn_rule.return_ip_xlate, ntohs(msg->conn_rule.return_ident_xlate), - msg->conn_rule.return_mac, - msg->valid_flags, msg->src_mac_rule.mac_valid_flags); - - return 0; -} - -/* - * sfe_ipv4_destroy_rule() - * Destroy a forwarding rule. - */ -void sfe_ipv4_destroy_rule(struct sfe_ipv4_rule_destroy_msg *msg) -{ - struct sfe_ipv4 *si = &__si; - struct sfe_ipv4_connection *c; - bool ret; - struct sfe_ipv4_5tuple *tuple = &msg->tuple; - - this_cpu_inc(si->stats_pcpu->connection_destroy_requests64); - spin_lock_bh(&si->lock); - - /* - * Check to see if we have a flow that matches the rule we're trying - * to destroy. If there isn't then we can't destroy it. - */ - c = sfe_ipv4_find_connection(si, tuple->protocol, tuple->flow_ip, tuple->flow_ident, - tuple->return_ip, tuple->return_ident); - if (!c) { - spin_unlock_bh(&si->lock); - this_cpu_inc(si->stats_pcpu->connection_destroy_misses64); - - DEBUG_TRACE("connection does not exist - p: %d, s: %pI4:%u, d: %pI4:%u\n", - tuple->protocol, &tuple->flow_ip, ntohs(tuple->flow_ident), - &tuple->return_ip, ntohs(tuple->return_ident)); - return; - } - - /* - * Remove our connection details from the hash tables. 
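- *
- * For context, a minimal caller-side sketch (illustrative, not code from
- * this file) of how a connection manager such as ECM would request a
- * destroy; field values are placeholders in network byte order:
- *
- *	struct sfe_ipv4_rule_destroy_msg msg = { };
- *
- *	msg.tuple.protocol = IPPROTO_UDP;
- *	msg.tuple.flow_ip = flow_ip;
- *	msg.tuple.flow_ident = flow_ident;
- *	msg.tuple.return_ip = return_ip;
- *	msg.tuple.return_ident = return_ident;
- *	sfe_ipv4_destroy_rule(&msg);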
- */
-	ret = sfe_ipv4_remove_connection(si, c);
-	spin_unlock_bh(&si->lock);
-
-	if (ret) {
-		sfe_ipv4_flush_connection(si, c, SFE_SYNC_REASON_DESTROY);
-	}
-
-	DEBUG_INFO("connection destroyed - p: %d, s: %pI4:%u, d: %pI4:%u\n",
-		   tuple->protocol, &tuple->flow_ip, ntohs(tuple->flow_ident),
-		   &tuple->return_ip, ntohs(tuple->return_ident));
-}
-
-/*
- * sfe_ipv4_sync_invoke()
- *	Schedule a stats sync covering many connections.
- */
-bool sfe_ipv4_sync_invoke(uint16_t index)
-{
-	struct sfe_ipv4 *si = &__si;
-	DEBUG_INFO("Request for a sync with index[%d]\n", index);
-	return schedule_delayed_work_on(si->work_cpu, &(si->sync_dwork), 0);
-}
-
-/*
- * sfe_ipv4_register_many_sync_callback()
- *	Register a callback for many-connection rule synchronization.
- */
-void sfe_ipv4_register_many_sync_callback(sfe_ipv4_many_sync_callback_t cb)
-{
-	struct sfe_ipv4 *si = &__si;
-
-	spin_lock_bh(&si->lock);
-	rcu_assign_pointer(si->many_sync_callback, cb);
-	spin_unlock_bh(&si->lock);
-}
-
-/*
- * sfe_ipv4_register_sync_rule_callback()
- *	Register a callback for rule synchronization.
- */
-void sfe_ipv4_register_sync_rule_callback(sfe_sync_rule_callback_t sync_rule_callback)
-{
-	struct sfe_ipv4 *si = &__si;
-
-	spin_lock_bh(&si->lock);
-	rcu_assign_pointer(si->sync_rule_callback, sync_rule_callback);
-	spin_unlock_bh(&si->lock);
-}
-
-/*
- * sfe_ipv4_get_debug_dev()
- */
-static ssize_t sfe_ipv4_get_debug_dev(struct device *dev,
-				      struct device_attribute *attr,
-				      char *buf)
-{
-	struct sfe_ipv4 *si = &__si;
-	ssize_t count;
-	int num;
-
-	spin_lock_bh(&si->lock);
-	num = si->debug_dev;
-	spin_unlock_bh(&si->lock);
-
-	count = snprintf(buf, (ssize_t)PAGE_SIZE, "%d\n", num);
-	return count;
-}
-
-/*
- * sysfs attributes.
- */
-static const struct device_attribute sfe_ipv4_debug_dev_attr =
-	__ATTR(debug_dev, S_IWUSR | S_IRUGO, sfe_ipv4_get_debug_dev, NULL);
-
-/*
- * sfe_ipv4_destroy_all_rules_for_dev()
- *	Destroy all connections that match a particular device.
- *
- * If we pass dev as NULL then this destroys all connections.
- */
-void sfe_ipv4_destroy_all_rules_for_dev(struct net_device *dev)
-{
-	struct sfe_ipv4 *si = &__si;
-	struct sfe_ipv4_connection *c;
-	bool ret;
-
-another_round:
-	spin_lock_bh(&si->lock);
-
-	for (c = si->all_connections_head; c; c = c->all_connections_next) {
-		/*
-		 * Does this connection relate to the device we are destroying?
-		 */
-		if (!dev
-		    || (dev == c->original_dev)
-		    || (dev == c->reply_dev)) {
-			break;
-		}
-	}
-
-	if (c) {
-		ret = sfe_ipv4_remove_connection(si, c);
-	}
-
-	spin_unlock_bh(&si->lock);
-
-	if (c) {
-		if (ret) {
-			sfe_ipv4_flush_connection(si, c, SFE_SYNC_REASON_DESTROY);
-		}
-		goto another_round;
-	}
-}
-
-/*
- * sfe_ipv4_periodic_sync()
- */
-static void sfe_ipv4_periodic_sync(struct work_struct *work)
-{
-	struct sfe_ipv4 *si = container_of((struct delayed_work *)work, struct sfe_ipv4, sync_dwork);
-	u64 now_jiffies;
-	int quota, count;
-	sfe_ipv4_many_sync_callback_t sync_rule_callback;
-	struct sfe_ipv4_connection *c;
-	struct sfe_ipv4_conn_sync *conn_sync;
-
-	now_jiffies = get_jiffies_64();
-
-	rcu_read_lock();
-	sync_rule_callback = rcu_dereference(si->many_sync_callback);
-	rcu_read_unlock();
-	if (!sync_rule_callback) {
-		return;
-	}
-
-	spin_lock_bh(&si->lock);
-
-	/*
-	 * If we have reached the end of the connection list, walk from
-	 * the connection head.
-	 */
-	c = si->wc_next;
-	if (unlikely(!c)) {
-		c = si->all_connections_head;
-	}
-
-	/*
-	 * Get the max number of connections to be put in this sync msg.
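- *
- * (Grounded note: sfe_ipv4_sync_max_number is computed at init time as
- *  (PAGE_SIZE - sizeof(struct sfe_ipv4_msg)) / sizeof(struct sfe_ipv4_conn_sync),
- *  i.e. as many per-connection sync records as fit in one page-sized
- *  message.)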
- */
-	quota = sfe_ipv4_sync_max_number;
-	conn_sync = sfe_ipv4_sync_many_msg->msg.conn_stats_many.conn_sync;
-
-	/*
-	 * Walk the "all connection" list and sync the connection state.
-	 */
-	while (likely(c && quota)) {
-		struct sfe_ipv4_connection_match *cm;
-		struct sfe_ipv4_connection_match *counter_cm;
-		struct sfe_connection_sync sis;
-
-		cm = c->original_match;
-		counter_cm = c->reply_match;
-
-		/*
-		 * If no packets were received in either the original or the
-		 * reply direction, move to the next connection.
-		 */
-		if ((!atomic_read(&cm->rx_packet_count)) && !(atomic_read(&counter_cm->rx_packet_count))) {
-			c = c->all_connections_next;
-			continue;
-		}
-
-		sfe_ipv4_gen_sync_connection(si, c, &sis, SFE_SYNC_REASON_STATS, now_jiffies);
-		sfe_ipv4_stats_convert(conn_sync, &sis);
-
-		quota--;
-		conn_sync++;
-		c = c->all_connections_next;
-	}
-
-	/*
-	 * At the end of the sync, point wc_next at the connection where we
-	 * stopped.
-	 */
-	si->wc_next = c;
-	spin_unlock_bh(&si->lock);
-
-	count = sfe_ipv4_sync_max_number - quota;
-
-	/*
-	 * Tell ECM the sync round is done if we reached the end of the
-	 * connection list; otherwise report the number of entries in the msg.
-	 */
-	if (c == NULL) {
-		DEBUG_INFO("Synced all connections.\n");
-		sfe_ipv4_sync_many_msg->msg.conn_stats_many.next = 0;
-	} else {
-		DEBUG_INFO("Some connections left.\n");
-		sfe_ipv4_sync_many_msg->msg.conn_stats_many.next = count;
-	}
-	DEBUG_INFO("Sync %d connections\n", count);
-	sfe_ipv4_sync_many_msg->msg.conn_stats_many.count = count;
-	sfe_ipv4_sync_many_msg->cm.response = SFE_CMN_RESPONSE_ACK;
-
-	sync_rule_callback(sfe_ipv4_sync_many_msg);
-}
-
-#define CHAR_DEV_MSG_SIZE 768
-
-/*
- * sfe_ipv4_debug_dev_read_start()
- *	Generate part of the XML output.
- */
-static bool sfe_ipv4_debug_dev_read_start(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length,
-					  int *total_read, struct sfe_ipv4_debug_xml_write_state *ws)
-{
-	int bytes_read;
-
-	si->debug_read_seq++;
-
-	bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "<sfe_ipv4>\n");
-	if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) {
-		return false;
-	}
-
-	*length -= bytes_read;
-	*total_read += bytes_read;
-
-	ws->state++;
-	return true;
-}
-
-/*
- * sfe_ipv4_debug_dev_read_connections_start()
- *	Generate part of the XML output.
- */
-static bool sfe_ipv4_debug_dev_read_connections_start(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length,
-						      int *total_read, struct sfe_ipv4_debug_xml_write_state *ws)
-{
-	int bytes_read;
-
-	bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t<connections>\n");
-	if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) {
-		return false;
-	}
-
-	*length -= bytes_read;
-	*total_read += bytes_read;
-
-	ws->state++;
-	return true;
-}
-
-/*
- * sfe_ipv4_debug_dev_read_connections_connection()
- *	Generate part of the XML output.
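- *
- * (Grounded note: each call of the writer below emits at most one
- *  connection element; the per-connection debug_read_seq tag is compared
- *  against si->debug_read_seq so that every connection is reported once
- *  per read pass.)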
- */ -static bool sfe_ipv4_debug_dev_read_connections_connection(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length, - int *total_read, struct sfe_ipv4_debug_xml_write_state *ws) -{ - struct sfe_ipv4_connection *c; - struct sfe_ipv4_connection_match *original_cm; - struct sfe_ipv4_connection_match *reply_cm; - int bytes_read; - int protocol; - struct net_device *src_dev; - __be32 src_ip; - __be32 src_ip_xlate; - __be16 src_port; - __be16 src_port_xlate; - u64 src_rx_packets; - u64 src_rx_bytes; - struct net_device *dest_dev; - __be32 dest_ip; - __be32 dest_ip_xlate; - __be16 dest_port; - __be16 dest_port_xlate; - u64 dest_rx_packets; - u64 dest_rx_bytes; - u64 last_sync_jiffies; - u32 src_mark, dest_mark, src_priority, dest_priority, src_dscp, dest_dscp; - bool original_cm_sawf_valid, reply_cm_sawf_valid; - u32 flow_service_class, return_service_class; - u32 flow_msduq, return_msduq; - u32 packet, byte, original_cm_flags; - u16 pppoe_session_id; - u8 pppoe_remote_mac[ETH_ALEN]; - u32 original_fast_xmit, reply_fast_xmit; -#ifdef CONFIG_NF_FLOW_COOKIE - int src_flow_cookie, dst_flow_cookie; -#endif - - spin_lock_bh(&si->lock); - - for (c = si->all_connections_head; c; c = c->all_connections_next) { - if (c->debug_read_seq < si->debug_read_seq) { - c->debug_read_seq = si->debug_read_seq; - break; - } - } - - /* - * If there were no connections then move to the next state. - */ - if (!c || c->removed) { - spin_unlock_bh(&si->lock); - ws->state++; - return true; - } - - original_cm = c->original_match; - reply_cm = c->reply_match; - - protocol = c->protocol; - src_dev = c->original_dev; - src_ip = c->src_ip; - src_ip_xlate = c->src_ip_xlate; - src_port = c->src_port; - src_port_xlate = c->src_port_xlate; - src_priority = original_cm->priority; - src_dscp = original_cm->dscp >> SFE_IPV4_DSCP_SHIFT; - - sfe_ipv4_connection_match_update_summary_stats(original_cm, &packet, &byte); - sfe_ipv4_connection_match_update_summary_stats(reply_cm, &packet, &byte); - - src_rx_packets = original_cm->rx_packet_count64; - src_rx_bytes = original_cm->rx_byte_count64; - src_mark = original_cm->mark; - original_fast_xmit = (original_cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_FAST_XMIT); - dest_dev = c->reply_dev; - dest_ip = c->dest_ip; - dest_ip_xlate = c->dest_ip_xlate; - dest_port = c->dest_port; - dest_port_xlate = c->dest_port_xlate; - dest_priority = reply_cm->priority; - dest_dscp = reply_cm->dscp >> SFE_IPV4_DSCP_SHIFT; - dest_rx_packets = reply_cm->rx_packet_count64; - dest_rx_bytes = reply_cm->rx_byte_count64; - dest_mark = reply_cm->mark; - reply_fast_xmit = (reply_cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_FAST_XMIT); - last_sync_jiffies = get_jiffies_64() - c->last_sync_jiffies; - original_cm_flags = original_cm->flags; - pppoe_session_id = original_cm->pppoe_session_id; - ether_addr_copy(pppoe_remote_mac, original_cm->pppoe_remote_mac); - original_cm_sawf_valid = original_cm->sawf_valid; - reply_cm_sawf_valid = reply_cm->sawf_valid; - flow_service_class = SFE_GET_SAWF_SERVICE_CLASS(original_cm->mark); - flow_msduq = SFE_GET_SAWF_MSDUQ(original_cm->mark); - return_service_class = SFE_GET_SAWF_SERVICE_CLASS(reply_cm->mark); - return_msduq = SFE_GET_SAWF_MSDUQ(reply_cm->mark); -#ifdef CONFIG_NF_FLOW_COOKIE - src_flow_cookie = original_cm->flow_cookie; - dst_flow_cookie = reply_cm->flow_cookie; -#endif - spin_unlock_bh(&si->lock); - - bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t\tname, - &src_ip, &src_ip_xlate, - ntohs(src_port), ntohs(src_port_xlate), - src_priority, src_dscp, - 
src_rx_packets, src_rx_bytes, - src_mark, - original_fast_xmit ? "Yes" : "No", - dest_dev->name, - &dest_ip, &dest_ip_xlate, - ntohs(dest_port), ntohs(dest_port_xlate), - dest_priority, dest_dscp, - dest_rx_packets, dest_rx_bytes, - dest_mark, - reply_fast_xmit ? "Yes" : "No", -#ifdef CONFIG_NF_FLOW_COOKIE - src_flow_cookie, dst_flow_cookie, -#endif - last_sync_jiffies); - - if (original_cm_flags &= (SFE_IPV4_CONNECTION_MATCH_FLAG_PPPOE_DECAP | SFE_IPV4_CONNECTION_MATCH_FLAG_PPPOE_ENCAP)) { - bytes_read += snprintf(msg + bytes_read, CHAR_DEV_MSG_SIZE, "pppoe_session_id=\"%u\" pppoe_server MAC=\"%pM\" ", - pppoe_session_id, pppoe_remote_mac); - } - - if (original_cm_sawf_valid) { - bytes_read += snprintf(msg + bytes_read, CHAR_DEV_MSG_SIZE, "flow_service_class=\"%d\" flow_msduq = \"0x%x\" ", - flow_service_class, flow_msduq); - } - - if (reply_cm_sawf_valid) { - bytes_read += snprintf(msg + bytes_read, CHAR_DEV_MSG_SIZE, "return_service_class=\"%d\" return_msduq = \"0x%x\" ", - return_service_class, return_msduq); - } - - bytes_read += snprintf(msg + bytes_read, CHAR_DEV_MSG_SIZE, "/>\n"); - - if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) { - return false; - } - - *length -= bytes_read; - *total_read += bytes_read; - - return true; -} - -/* - * sfe_ipv4_debug_dev_read_connections_end() - * Generate part of the XML output. - */ -static bool sfe_ipv4_debug_dev_read_connections_end(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length, - int *total_read, struct sfe_ipv4_debug_xml_write_state *ws) -{ - int bytes_read; - - bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t\n"); - if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) { - return false; - } - - *length -= bytes_read; - *total_read += bytes_read; - - ws->state++; - return true; -} - -/* - * sfe_ipv4_debug_dev_read_exceptions_start() - * Generate part of the XML output. - */ -static bool sfe_ipv4_debug_dev_read_exceptions_start(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length, - int *total_read, struct sfe_ipv4_debug_xml_write_state *ws) -{ - int bytes_read; - - bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t\n"); - if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) { - return false; - } - - *length -= bytes_read; - *total_read += bytes_read; - - ws->state++; - return true; -} - -/* - * sfe_ipv4_debug_dev_read_exceptions_exception() - * Generate part of the XML output. - */ -static bool sfe_ipv4_debug_dev_read_exceptions_exception(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length, - int *total_read, struct sfe_ipv4_debug_xml_write_state *ws) -{ - int i; - u64 val = 0; - - for_each_possible_cpu(i) { - const struct sfe_ipv4_stats *s = per_cpu_ptr(si->stats_pcpu, i); - val += s->exception_events64[ws->iter_exception]; - } - - if (val) { - int bytes_read; - - bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, - "\t\t\n", - sfe_ipv4_exception_events_string[ws->iter_exception], - val); - if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) { - return false; - } - - *length -= bytes_read; - *total_read += bytes_read; - } - - ws->iter_exception++; - if (ws->iter_exception >= SFE_IPV4_EXCEPTION_EVENT_LAST) { - ws->iter_exception = 0; - ws->state++; - } - - return true; -} - -/* - * sfe_ipv4_debug_dev_read_exceptions_end() - * Generate part of the XML output. 
- */ -static bool sfe_ipv4_debug_dev_read_exceptions_end(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length, - int *total_read, struct sfe_ipv4_debug_xml_write_state *ws) -{ - int bytes_read; - - bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t\n"); - if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) { - return false; - } - - *length -= bytes_read; - *total_read += bytes_read; - - ws->state++; - return true; -} - -/* - * sfe_ipv4_debug_dev_read_stats() - * Generate part of the XML output. - */ -static bool sfe_ipv4_debug_dev_read_stats(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length, - int *total_read, struct sfe_ipv4_debug_xml_write_state *ws) -{ - int bytes_read; - struct sfe_ipv4_stats stats; - unsigned int num_conn; - - sfe_ipv4_update_summary_stats(si, &stats); - - spin_lock_bh(&si->lock); - num_conn = si->num_connections; - spin_unlock_bh(&si->lock); - - bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t\n", - num_conn, - stats.packets_dropped64, - stats.packets_fast_xmited64, - stats.packets_forwarded64, - stats.packets_not_forwarded64, - stats.connection_create_requests64, - stats.connection_create_collisions64, - stats.connection_create_failures64, - stats.connection_destroy_requests64, - stats.connection_destroy_misses64, - stats.connection_flushes64, - stats.connection_match_hash_hits64, - stats.connection_match_hash_reorders64, - stats.pppoe_encap_packets_forwarded64, - stats.pppoe_decap_packets_forwarded64, - stats.pppoe_bridge_packets_forwarded64, - stats.pppoe_bridge_packets_3tuple_forwarded64); - if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) { - return false; - } - - *length -= bytes_read; - *total_read += bytes_read; - - ws->state++; - return true; -} - -/* - * sfe_ipv4_debug_dev_read_end() - * Generate part of the XML output. - */ -static bool sfe_ipv4_debug_dev_read_end(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length, - int *total_read, struct sfe_ipv4_debug_xml_write_state *ws) -{ - int bytes_read; - - bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\n"); - if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) { - return false; - } - - *length -= bytes_read; - *total_read += bytes_read; - - ws->state++; - return true; -} - -/* - * Array of write functions that write various XML elements that correspond to - * our XML output state machine. 
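- *
- * (Grounded note: the array below is indexed by enum sfe_ipv4_debug_xml_states,
- *  one writer per state from SFE_IPV4_DEBUG_XML_STATE_START through
- *  SFE_IPV4_DEBUG_XML_STATE_END; the read loop keeps calling
- *  sfe_ipv4_debug_xml_write_methods[ws->state] until the state machine
- *  reaches SFE_IPV4_DEBUG_XML_STATE_DONE.)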
- */ -static sfe_ipv4_debug_xml_write_method_t sfe_ipv4_debug_xml_write_methods[SFE_IPV4_DEBUG_XML_STATE_DONE] = { - sfe_ipv4_debug_dev_read_start, - sfe_ipv4_debug_dev_read_connections_start, - sfe_ipv4_debug_dev_read_connections_connection, - sfe_ipv4_debug_dev_read_connections_end, - sfe_ipv4_debug_dev_read_exceptions_start, - sfe_ipv4_debug_dev_read_exceptions_exception, - sfe_ipv4_debug_dev_read_exceptions_end, - sfe_ipv4_debug_dev_read_stats, - sfe_ipv4_debug_dev_read_end, -}; - -/* - * sfe_ipv4_debug_dev_read() - * Send info to userspace upon read request from user - */ -static ssize_t sfe_ipv4_debug_dev_read(struct file *filp, char *buffer, size_t length, loff_t *offset) -{ - char msg[CHAR_DEV_MSG_SIZE]; - int total_read = 0; - struct sfe_ipv4_debug_xml_write_state *ws; - struct sfe_ipv4 *si = &__si; - - ws = (struct sfe_ipv4_debug_xml_write_state *)filp->private_data; - while ((ws->state != SFE_IPV4_DEBUG_XML_STATE_DONE) && (length > CHAR_DEV_MSG_SIZE)) { - if ((sfe_ipv4_debug_xml_write_methods[ws->state])(si, buffer, msg, &length, &total_read, ws)) { - continue; - } - } - - return total_read; -} - -/* - * sfe_ipv4_debug_dev_open() - */ -static int sfe_ipv4_debug_dev_open(struct inode *inode, struct file *file) -{ - struct sfe_ipv4_debug_xml_write_state *ws; - - ws = (struct sfe_ipv4_debug_xml_write_state *)file->private_data; - if (!ws) { - ws = kzalloc(sizeof(struct sfe_ipv4_debug_xml_write_state), GFP_KERNEL); - if (!ws) { - return -ENOMEM; - } - - ws->state = SFE_IPV4_DEBUG_XML_STATE_START; - file->private_data = ws; - } - - return 0; -} - -/* - * sfe_ipv4_debug_dev_release() - */ -static int sfe_ipv4_debug_dev_release(struct inode *inode, struct file *file) -{ - struct sfe_ipv4_debug_xml_write_state *ws; - - ws = (struct sfe_ipv4_debug_xml_write_state *)file->private_data; - if (ws) { - /* - * We've finished with our output so free the write state. - */ - kfree(ws); - file->private_data = NULL; - } - - return 0; -} - -/* - * File operations used in the debug char device - */ -static struct file_operations sfe_ipv4_debug_dev_fops = { - .read = sfe_ipv4_debug_dev_read, - .open = sfe_ipv4_debug_dev_open, - .release = sfe_ipv4_debug_dev_release -}; - -#ifdef CONFIG_NF_FLOW_COOKIE -/* - * sfe_register_flow_cookie_cb - * register a function in SFE to let SFE use this function to configure flow cookie for a flow - * - * Hardware driver which support flow cookie should register a callback function in SFE. Then SFE - * can use this function to configure flow cookie for a flow. 
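- *
- * A hedged driver-side usage sketch (my_set_flow_cookie is an illustrative
- * callback of type flow_cookie_set_func_t, not a real symbol):
- *
- *	if (sfe_register_flow_cookie_cb(my_set_flow_cookie)) {
- *		pr_err("sfe: flow cookie callback already registered\n");
- *	}
- *	...
- *	sfe_unregister_flow_cookie_cb(my_set_flow_cookie);
- *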
- * return: 0, success; !=0, fail - */ -int sfe_register_flow_cookie_cb(flow_cookie_set_func_t cb) -{ - struct sfe_ipv4 *si = &__si; - - BUG_ON(!cb); - - if (si->flow_cookie_set_func) { - return -1; - } - - rcu_assign_pointer(si->flow_cookie_set_func, cb); - return 0; -} - -/* - * sfe_unregister_flow_cookie_cb - * unregister function which is used to configure flow cookie for a flow - * - * return: 0, success; !=0, fail - */ -int sfe_unregister_flow_cookie_cb(flow_cookie_set_func_t cb) -{ - struct sfe_ipv4 *si = &__si; - - RCU_INIT_POINTER(si->flow_cookie_set_func, NULL); - return 0; -} - -/* - * sfe_ipv4_get_flow_cookie() - */ -static ssize_t sfe_ipv4_get_flow_cookie(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct sfe_ipv4 *si = &__si; - return snprintf(buf, (ssize_t)PAGE_SIZE, "%d\n", si->flow_cookie_enable); -} - -/* - * sfe_ipv4_set_flow_cookie() - */ -static ssize_t sfe_ipv4_set_flow_cookie(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t size) -{ - struct sfe_ipv4 *si = &__si; - si->flow_cookie_enable = simple_strtol(buf, NULL, 0); - - return size; -} - -/* - * sysfs attributes. - */ -static const struct device_attribute sfe_ipv4_flow_cookie_attr = - __ATTR(flow_cookie_enable, S_IWUSR | S_IRUGO, sfe_ipv4_get_flow_cookie, sfe_ipv4_set_flow_cookie); -#endif /*CONFIG_NF_FLOW_COOKIE*/ - -/* - * sfe_ipv4_get_cpu() - */ -static ssize_t sfe_ipv4_get_cpu(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct sfe_ipv4 *si = &__si; - return snprintf(buf, (ssize_t)PAGE_SIZE, "%d\n", si->work_cpu); -} - -/* - * sfe_ipv4_set_cpu() - */ -static ssize_t sfe_ipv4_set_cpu(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t size) -{ - struct sfe_ipv4 *si = &__si; - int work_cpu; - work_cpu = simple_strtol(buf, NULL, 0); - if ((work_cpu >= 0) && (work_cpu <= NR_CPUS)) { - si->work_cpu = work_cpu; - } else { - dev_err(dev, "%s is not in valid range[0,%d]", buf, NR_CPUS); - } - return size; -} - -/* - * sysfs attributes. - */ -static const struct device_attribute sfe_ipv4_cpu_attr = - __ATTR(stats_work_cpu, S_IWUSR | S_IRUGO, sfe_ipv4_get_cpu, sfe_ipv4_set_cpu); - -/* - * sfe_ipv4_conn_match_hash_init() - * Initialize conn match hash lists - */ -static void sfe_ipv4_conn_match_hash_init(struct sfe_ipv4 *si, int len) -{ - struct hlist_head *hash_list = si->hlist_conn_match_hash_head; - int i; - - for (i = 0; i < len; i++) { - INIT_HLIST_HEAD(&hash_list[i]); - } -} - -#ifdef SFE_PROCESS_LOCAL_OUT -/* - * sfe_ipv4_local_out() - * Called for packets from ip_local_out() - post encapsulation & other packets - */ -static unsigned int sfe_ipv4_local_out(void *priv, struct sk_buff *skb, const struct nf_hook_state *nhs) -{ - struct sfe_l2_info l2_info = {0}; - - DEBUG_TRACE("%px: sfe: sfe_ipv4_local_out hook called.\n", skb); - - if (likely(skb->skb_iif)) { - return sfe_ipv4_recv(skb->dev, skb, &l2_info, true) ? NF_STOLEN : NF_ACCEPT; - } - - return NF_ACCEPT; -} - -/* - * struct nf_hook_ops sfe_ipv4_ops_local_out[] - * Hooks into netfilter local out packet monitoring points. - */ -static struct nf_hook_ops sfe_ipv4_ops_local_out[] __read_mostly = { - - /* - * Local out routing hook is used to monitor packets. 
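- *
- * (Grounded note: the hook below attaches at NF_INET_LOCAL_OUT with
- *  priority NF_IP_PRI_FIRST, so locally generated packets, e.g.
- *  post-encapsulation traffic, reach sfe_ipv4_local_out() before any other
- *  local-out netfilter hook.)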
- */
-	{
-		.hook = sfe_ipv4_local_out,
-		.pf = PF_INET,
-		.hooknum = NF_INET_LOCAL_OUT,
-		.priority = NF_IP_PRI_FIRST,
-	},
-};
-#endif
-
-/*
- * sfe_ipv4_init()
- */
-int sfe_ipv4_init(void)
-{
-	struct sfe_ipv4 *si = &__si;
-	int result = -1;
-
-	DEBUG_INFO("SFE IPv4 init\n");
-
-	sfe_ipv4_conn_match_hash_init(si, ARRAY_SIZE(si->hlist_conn_match_hash_head));
-
-	si->stats_pcpu = alloc_percpu_gfp(struct sfe_ipv4_stats, GFP_KERNEL | __GFP_ZERO);
-	if (!si->stats_pcpu) {
-		DEBUG_ERROR("failed to allocate stats memory for sfe_ipv4\n");
-		goto exit0;
-	}
-
-	/*
-	 * Allocate per-CPU, per-service-class memory.
-	 */
-	si->stats_pcpu_psc = alloc_percpu_gfp(struct sfe_ipv4_service_class_stats_db,
-					      GFP_KERNEL | __GFP_ZERO);
-	if (!si->stats_pcpu_psc) {
-		DEBUG_ERROR("failed to allocate per cpu per service class stats memory\n");
-		goto exit1;
-	}
-
-	/*
-	 * Create sys/sfe_ipv4
-	 */
-	si->sys_ipv4 = kobject_create_and_add("sfe_ipv4", NULL);
-	if (!si->sys_ipv4) {
-		DEBUG_ERROR("failed to register sfe_ipv4\n");
-		goto exit2;
-	}
-
-	/*
-	 * Create files, one for each parameter supported by this module.
-	 */
-	result = sysfs_create_file(si->sys_ipv4, &sfe_ipv4_debug_dev_attr.attr);
-	if (result) {
-		DEBUG_ERROR("failed to register debug dev file: %d\n", result);
-		goto exit3;
-	}
-
-	result = sysfs_create_file(si->sys_ipv4, &sfe_ipv4_cpu_attr.attr);
-	if (result) {
-		DEBUG_ERROR("failed to register stats work cpu file: %d\n", result);
-		goto exit4;
-	}
-
-#ifdef CONFIG_NF_FLOW_COOKIE
-	result = sysfs_create_file(si->sys_ipv4, &sfe_ipv4_flow_cookie_attr.attr);
-	if (result) {
-		DEBUG_ERROR("failed to register flow cookie enable file: %d\n", result);
-		goto exit5;
-	}
-#endif /* CONFIG_NF_FLOW_COOKIE */
-
-#ifdef SFE_PROCESS_LOCAL_OUT
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 13, 0))
-	result = nf_register_hooks(sfe_ipv4_ops_local_out, ARRAY_SIZE(sfe_ipv4_ops_local_out));
-#else
-	result = nf_register_net_hooks(&init_net, sfe_ipv4_ops_local_out, ARRAY_SIZE(sfe_ipv4_ops_local_out));
-#endif
-	if (result < 0) {
-		DEBUG_ERROR("can't register nf local out hook: %d\n", result);
-		goto exit6;
-	}
-	DEBUG_INFO("Registered nf local out hook successfully: %d\n", result);
-#endif
-
-	/*
-	 * Register our debug char device.
-	 */
-	result = register_chrdev(0, "sfe_ipv4", &sfe_ipv4_debug_dev_fops);
-	if (result < 0) {
-		DEBUG_ERROR("Failed to register chrdev: %d\n", result);
-		goto exit7;
-	}
-
-	si->debug_dev = result;
-	si->work_cpu = WORK_CPU_UNBOUND;
-
-	/*
-	 * Create a work item to handle pull messages from ECM.
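- *
- * (Grounded note: this work item is kicked on demand by
- *  sfe_ipv4_sync_invoke(), which calls
- *  schedule_delayed_work_on(si->work_cpu, &si->sync_dwork, 0); the
- *  stats_work_cpu sysfs file above selects the CPU it runs on, defaulting
- *  to WORK_CPU_UNBOUND.)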
- */ - INIT_DELAYED_WORK(&(si->sync_dwork), sfe_ipv4_periodic_sync); - /* - * Allocate a message for stats sync many - */ - sfe_ipv4_sync_many_msg = kzalloc(PAGE_SIZE, GFP_KERNEL); - if(!sfe_ipv4_sync_many_msg) { - goto exit8; - } - - sfe_ipv4_msg_init(sfe_ipv4_sync_many_msg, SFE_SPECIAL_INTERFACE_IPV4, - SFE_TX_CONN_STATS_SYNC_MANY_MSG, - sizeof(struct sfe_ipv4_conn_sync_many_msg), - NULL, - NULL); - sfe_ipv4_sync_max_number = (PAGE_SIZE - sizeof(struct sfe_ipv4_msg)) / sizeof(struct sfe_ipv4_conn_sync); - - spin_lock_init(&si->lock); - return 0; - -exit8: - unregister_chrdev(si->debug_dev, "sfe_ipv4"); - -exit7: -#ifdef SFE_PROCESS_LOCAL_OUT - DEBUG_TRACE("sfe: Unregister local out hook\n"); -#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 13, 0)) - nf_unregister_hooks(sfe_ipv4_ops_local_out, ARRAY_SIZE(sfe_ipv4_ops_local_out)); -#else - nf_unregister_net_hooks(&init_net, sfe_ipv4_ops_local_out, ARRAY_SIZE(sfe_ipv4_ops_local_out)); -#endif -exit6: -#endif -#ifdef CONFIG_NF_FLOW_COOKIE - sysfs_remove_file(si->sys_ipv4, &sfe_ipv4_flow_cookie_attr.attr); - -exit5: -#endif /* CONFIG_NF_FLOW_COOKIE */ - sysfs_remove_file(si->sys_ipv4, &sfe_ipv4_cpu_attr.attr); -exit4: - sysfs_remove_file(si->sys_ipv4, &sfe_ipv4_debug_dev_attr.attr); - -exit3: - kobject_put(si->sys_ipv4); - -exit2: - free_percpu(si->stats_pcpu_psc); - -exit1: - free_percpu(si->stats_pcpu); - -exit0: - return result; -} - -/* - * sfe_ipv4_exit() - */ -void sfe_ipv4_exit(void) -{ - struct sfe_ipv4 *si = &__si; - - DEBUG_INFO("SFE IPv4 exit\n"); - /* - * Destroy all connections. - */ - sfe_ipv4_destroy_all_rules_for_dev(NULL); - - cancel_delayed_work_sync(&si->sync_dwork); - - unregister_chrdev(si->debug_dev, "sfe_ipv4"); - -#ifdef SFE_PROCESS_LOCAL_OUT - DEBUG_TRACE("sfe: Unregister local out hook\n"); -#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 13, 0)) - nf_unregister_hooks(sfe_ipv4_ops_local_out, ARRAY_SIZE(sfe_ipv4_ops_local_out)); -#else - nf_unregister_net_hooks(&init_net, sfe_ipv4_ops_local_out, ARRAY_SIZE(sfe_ipv4_ops_local_out)); -#endif -#endif - -#ifdef CONFIG_NF_FLOW_COOKIE - sysfs_remove_file(si->sys_ipv4, &sfe_ipv4_flow_cookie_attr.attr); -#endif /* CONFIG_NF_FLOW_COOKIE */ - sysfs_remove_file(si->sys_ipv4, &sfe_ipv4_debug_dev_attr.attr); - - sysfs_remove_file(si->sys_ipv4, &sfe_ipv4_cpu_attr.attr); - - kobject_put(si->sys_ipv4); - - free_percpu(si->stats_pcpu); - free_percpu(si->stats_pcpu_psc); -} - -#ifdef CONFIG_NF_FLOW_COOKIE -EXPORT_SYMBOL(sfe_register_flow_cookie_cb); -EXPORT_SYMBOL(sfe_unregister_flow_cookie_cb); -#endif diff --git a/shortcut-fe/sfe_ipv4.h b/shortcut-fe/sfe_ipv4.h deleted file mode 100644 index de11f72b8..000000000 --- a/shortcut-fe/sfe_ipv4.h +++ /dev/null @@ -1,456 +0,0 @@ -/* - * sfe_ipv4.h - * Shortcut forwarding engine header file for IPv4. - * - * Copyright (c) 2013-2016, 2019-2020, The Linux Foundation. All rights reserved. - * Copyright (c) 2021-2022 Qualcomm Innovation Center, Inc. All rights reserved. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ - -#ifndef __SFE_IPV4_H -#define __SFE_IPV4_H - -#define SFE_IPV4_DSCP_MASK 0x3 -#define SFE_IPV4_DSCP_SHIFT 2 -#include - -/* - * Specifies the lower bound on ACK numbers carried in the TCP header - */ -#define SFE_IPV4_TCP_MAX_ACK_WINDOW 65520 - -/* - * IPv4 TCP connection match additional data. - */ -struct sfe_ipv4_tcp_connection_match { - u8 win_scale; /* Window scale */ - u32 max_win; /* Maximum window size seen */ - u32 end; /* Sequence number of the next byte to send (seq + segment length) */ - u32 max_end; /* Sequence number of the last byte to ack */ -}; - -/* - * Bit flags for IPv4 connection matching entry. - */ -#define SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_SRC (1<<0) - /* Perform source translation */ -#define SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_DEST (1<<1) - /* Perform destination translation */ -#define SFE_IPV4_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK (1<<2) - /* Ignore TCP sequence numbers */ -#define SFE_IPV4_CONNECTION_MATCH_FLAG_WRITE_FAST_ETH_HDR (1<<3) - /* Fast Ethernet header write */ -#define SFE_IPV4_CONNECTION_MATCH_FLAG_WRITE_L2_HDR (1<<4) - /* Fast Ethernet header write */ -#define SFE_IPV4_CONNECTION_MATCH_FLAG_PRIORITY_REMARK (1<<5) - /* remark priority of SKB */ -#define SFE_IPV4_CONNECTION_MATCH_FLAG_DSCP_REMARK (1<<6) - /* remark DSCP of packet */ -#define SFE_IPV4_CONNECTION_MATCH_FLAG_CSUM_OFFLOAD (1<<7) - /* checksum offload.*/ -#define SFE_IPV4_CONNECTION_MATCH_FLAG_PPPOE_DECAP (1<<8) - /* Indicates that PPPoE should be decapsulated */ -#define SFE_IPV4_CONNECTION_MATCH_FLAG_PPPOE_ENCAP (1<<9) - /* Indicates that PPPoE should be encapsulated */ -#define SFE_IPV4_CONNECTION_MATCH_FLAG_BRIDGE_FLOW (1<<10) - /* Bridge flow */ -#define SFE_IPV4_CONNECTION_MATCH_FLAG_MARK (1<<11) - /* skb mark of the packet */ -#define SFE_IPV4_CONNECTION_MATCH_FLAG_INSERT_EGRESS_VLAN_TAG (1<<12) - /* Insert VLAN tag */ -#define SFE_IPV4_CONNECTION_MATCH_FLAG_SRC_INTERFACE_CHECK (1<<13) - /* Source interface check */ -#define SFE_IPV4_CONNECTION_MATCH_FLAG_PASSTHROUGH (1<<14) - /* passthrough flow: encap/decap to be skipped for this flow */ -#define SFE_IPV4_CONNECTION_MATCH_FLAG_FAST_XMIT (1<<15) - /* skb go fast xmit */ -#define SFE_IPV4_CONNECTION_MATCH_FLAG_FAST_XMIT_FLOW_CHECKED (1<<16) - /* Fast xmit flow checked or not */ -#define SFE_IPV4_CONNECTION_MATCH_FLAG_FAST_XMIT_DEV_ADMISSION (1<<17) - /* Fast xmit may be possible for this flow, if SFE check passes */ -#define SFE_IPV4_CONNECTION_MATCH_FLAG_SRC_INTERFACE_CHECK_NO_FLUSH (1<<18) - /* Source interface check but do not flush the connection */ - -/* - * IPv4 connection matching structure. - */ -struct sfe_ipv4_connection_match { - /* - * References to other objects. - */ - struct hlist_node hnode; - - struct sfe_ipv4_connection *connection; - struct sfe_ipv4_connection_match *counter_match; - /* Matches the flow in the opposite direction as the one in *connection */ - /* - * Characteristics that identify flows that match this rule. 
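- *
- * (Grounded note: the match_dev, match_protocol, match_src_ip,
- *  match_src_port, match_dest_ip and match_dest_port fields below form the
- *  key used by sfe_ipv4_find_connection_match_rcu() at packet receive time.)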
- */ - struct net_device *match_dev; /* Network device */ - u8 match_protocol; /* Protocol */ - __be32 match_src_ip; /* Source IP address */ - __be32 match_dest_ip; /* Destination IP address */ - __be16 match_src_port; /* Source port/connection ident */ - __be16 match_dest_port; /* Destination port/connection ident */ - - struct udp_sock *up; /* Stores UDP sock information; valid only in decap path */ -#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 13, 0)) - const struct net_protocol *proto; /* stores protocol handler; valid only in decap path */ -#else - struct net_protocol *proto; /* stores protocol handler; valid only in decap path */ -#endif - - /* - * Control the operations of the match. - */ - u32 flags; /* Bit flags */ -#ifdef CONFIG_NF_FLOW_COOKIE - u32 flow_cookie; /* used flow cookie, for debug */ -#endif -#ifdef CONFIG_XFRM - u32 flow_accel; /* The flow accelerated or not */ -#endif - - /* - * Connection state that we track once we match. - */ - union { /* Protocol-specific state */ - struct sfe_ipv4_tcp_connection_match tcp; - } protocol_state; - - /* - * VLAN headers - */ - struct sfe_vlan_hdr ingress_vlan_hdr[SFE_MAX_VLAN_DEPTH]; - struct sfe_vlan_hdr egress_vlan_hdr[SFE_MAX_VLAN_DEPTH]; - - /* - * Stats recorded in a sync period. These stats will be added to - * rx_packet_count64/rx_byte_count64 after a sync period. - */ - atomic_t rx_packet_count; - atomic_t rx_byte_count; - - /* - * Packet translation information. - */ - __be32 xlate_src_ip; /* Address after source translation */ - __be16 xlate_src_port; /* Port/connection ident after source translation */ - u16 xlate_src_csum_adjustment; - /* Transport layer checksum adjustment after source translation */ - u16 xlate_src_partial_csum_adjustment; - /* Transport layer pseudo header checksum adjustment after source translation */ - - __be32 xlate_dest_ip; /* Address after destination translation */ - __be16 xlate_dest_port; /* Port/connection ident after destination translation */ - u16 xlate_dest_csum_adjustment; - /* Transport layer checksum adjustment after destination translation */ - u16 xlate_dest_partial_csum_adjustment; - /* Transport layer pseudo header checksum adjustment after destination translation */ - - /* - * QoS information - */ - u32 priority; - u32 dscp; - u32 mark; /* mark for outgoing packet */ - - /* - * Packet transmit information. - */ - struct net_device *xmit_dev; /* Network device on which to transmit */ - unsigned short int xmit_dev_mtu; - /* Interface MTU */ - u16 xmit_dest_mac[ETH_ALEN / 2]; - /* Destination MAC address to use when forwarding */ - u16 xmit_src_mac[ETH_ALEN / 2]; - /* Source MAC address to use when forwarding */ - - u8 ingress_vlan_hdr_cnt; /* Ingress active vlan headers count */ - u8 egress_vlan_hdr_cnt; /* Egress active vlan headers count */ - - /* - * Summary stats. - */ - u64 rx_packet_count64; - u64 rx_byte_count64; - - /* - * PPPoE information - */ - u16 pppoe_session_id; - u8 pppoe_remote_mac[ETH_ALEN]; - - struct net_device *top_interface_dev; /* Used by tun6rd to store decap VLAN netdevice.*/ - - /* - * Size of all needed L2 headers - */ - u16 l2_hdr_size; - - /* - * xmit device's feature - */ - netdev_features_t features; - bool sawf_valid; /* Indicates mark has valid SAWF information */ -}; - -/* - * Per-connection data structure. 
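- *
- * Before the connection object itself, a hedged sketch (RFC 1624 style,
- * an assumption rather than code from this file) of how one of the
- * xlate_*_csum_adjustment values above is typically folded into a
- * transport checksum:
- *
- *	u32 sum = (u16)~csum + adjustment;
- *	sum = (sum & 0xffff) + (sum >> 16);	(end-around carry fold)
- *	csum = (u16)~sum;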
- */ -struct sfe_ipv4_connection { - struct sfe_ipv4_connection *next; - /* Pointer to the next entry in a hash chain */ - struct sfe_ipv4_connection *prev; - /* Pointer to the previous entry in a hash chain */ - int protocol; /* IP protocol number */ - __be32 src_ip; /* Src IP addr pre-translation */ - __be32 src_ip_xlate; /* Src IP addr post-translation */ - __be32 dest_ip; /* Dest IP addr pre-translation */ - __be32 dest_ip_xlate; /* Dest IP addr post-translation */ - __be16 src_port; /* Src port pre-translation */ - __be16 src_port_xlate; /* Src port post-translation */ - __be16 dest_port; /* Dest port pre-translation */ - __be16 dest_port_xlate; /* Dest port post-translation */ - struct sfe_ipv4_connection_match *original_match; - /* Original direction matching structure */ - struct net_device *original_dev; - /* Original direction source device */ - struct sfe_ipv4_connection_match *reply_match; - /* Reply direction matching structure */ - struct net_device *reply_dev; /* Reply direction source device */ - u64 last_sync_jiffies; /* Jiffies count for the last sync */ - struct sfe_ipv4_connection *all_connections_next; - /* Pointer to the next entry in the list of all connections */ - struct sfe_ipv4_connection *all_connections_prev; - /* Pointer to the previous entry in the list of all connections */ - u32 debug_read_seq; /* sequence number for debug dump */ - bool removed; /* Indicates the connection is removed */ - struct rcu_head rcu; /* delay rcu free */ -}; - -/* - * IPv4 connections and hash table size information. - */ -#define SFE_IPV4_CONNECTION_HASH_SHIFT 12 -#define SFE_IPV4_CONNECTION_HASH_SIZE (1 << SFE_IPV4_CONNECTION_HASH_SHIFT) -#define SFE_IPV4_CONNECTION_HASH_MASK (SFE_IPV4_CONNECTION_HASH_SIZE - 1) - -enum sfe_ipv4_exception_events { - SFE_IPV4_EXCEPTION_EVENT_UDP_HEADER_INCOMPLETE, - SFE_IPV4_EXCEPTION_EVENT_UDP_NO_CONNECTION, - SFE_IPV4_EXCEPTION_EVENT_UDP_IP_OPTIONS_OR_INITIAL_FRAGMENT, - SFE_IPV4_EXCEPTION_EVENT_UDP_SMALL_TTL, - SFE_IPV4_EXCEPTION_EVENT_UDP_NEEDS_FRAGMENTATION, - SFE_IPV4_EXCEPTION_EVENT_TCP_HEADER_INCOMPLETE, - SFE_IPV4_EXCEPTION_EVENT_TCP_NO_CONNECTION_SLOW_FLAGS, - SFE_IPV4_EXCEPTION_EVENT_TCP_NO_CONNECTION_FAST_FLAGS, - SFE_IPV4_EXCEPTION_EVENT_TCP_IP_OPTIONS_OR_INITIAL_FRAGMENT, - SFE_IPV4_EXCEPTION_EVENT_TCP_SMALL_TTL, - SFE_IPV4_EXCEPTION_EVENT_TCP_NEEDS_FRAGMENTATION, - SFE_IPV4_EXCEPTION_EVENT_TCP_FLAGS, - SFE_IPV4_EXCEPTION_EVENT_TCP_SEQ_EXCEEDS_RIGHT_EDGE, - SFE_IPV4_EXCEPTION_EVENT_TCP_SMALL_DATA_OFFS, - SFE_IPV4_EXCEPTION_EVENT_TCP_BAD_SACK, - SFE_IPV4_EXCEPTION_EVENT_TCP_BIG_DATA_OFFS, - SFE_IPV4_EXCEPTION_EVENT_TCP_SEQ_BEFORE_LEFT_EDGE, - SFE_IPV4_EXCEPTION_EVENT_TCP_ACK_EXCEEDS_RIGHT_EDGE, - SFE_IPV4_EXCEPTION_EVENT_TCP_ACK_BEFORE_LEFT_EDGE, - SFE_IPV4_EXCEPTION_EVENT_ICMP_HEADER_INCOMPLETE, - SFE_IPV4_EXCEPTION_EVENT_ICMP_UNHANDLED_TYPE, - SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_HEADER_INCOMPLETE, - SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_NON_V4, - SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_IP_OPTIONS_INCOMPLETE, - SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_UDP_HEADER_INCOMPLETE, - SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_TCP_HEADER_INCOMPLETE, - SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_UNHANDLED_PROTOCOL, - SFE_IPV4_EXCEPTION_EVENT_ICMP_NO_CONNECTION, - SFE_IPV4_EXCEPTION_EVENT_ICMP_FLUSHED_CONNECTION, - SFE_IPV4_EXCEPTION_EVENT_HEADER_INCOMPLETE, - SFE_IPV4_EXCEPTION_EVENT_HEADER_CSUM_BAD, - SFE_IPV4_EXCEPTION_EVENT_BAD_TOTAL_LENGTH, - SFE_IPV4_EXCEPTION_EVENT_NON_V4, - SFE_IPV4_EXCEPTION_EVENT_NON_INITIAL_FRAGMENT, - SFE_IPV4_EXCEPTION_EVENT_DATAGRAM_INCOMPLETE, 
-	SFE_IPV4_EXCEPTION_EVENT_IP_OPTIONS_INCOMPLETE,
-	SFE_IPV4_EXCEPTION_EVENT_UNHANDLED_PROTOCOL,
-	SFE_IPV4_EXCEPTION_EVENT_NO_HEADROOM,
-	SFE_IPV4_EXCEPTION_EVENT_INVALID_PPPOE_SESSION,
-	SFE_IPV4_EXCEPTION_EVENT_INCORRECT_PPPOE_PARSING,
-	SFE_IPV4_EXCEPTION_EVENT_PPPOE_NOT_SET_IN_CME,
-	SFE_IPV4_EXCEPTION_EVENT_PPPOE_BR_NOT_IN_CME,
-	SFE_IPV4_EXCEPTION_EVENT_INGRESS_VLAN_TAG_MISMATCH,
-	SFE_IPV4_EXCEPTION_EVENT_INVALID_SRC_IFACE,
-	SFE_IPV4_EXCEPTION_EVENT_TUN6RD_NO_CONNECTION,
-	SFE_IPV4_EXCEPTION_EVENT_TUN6RD_NEEDS_FRAGMENTATION,
-	SFE_IPV4_EXCEPTION_EVENT_TUN6RD_SYNC_ON_FIND,
-	SFE_IPV4_EXCEPTION_EVENT_GRE_HEADER_INCOMPLETE,
-	SFE_IPV4_EXCEPTION_EVENT_GRE_NO_CONNECTION,
-	SFE_IPV4_EXCEPTION_EVENT_GRE_IP_OPTIONS_OR_INITIAL_FRAGMENT,
-	SFE_IPV4_EXCEPTION_EVENT_GRE_SMALL_TTL,
-	SFE_IPV4_EXCEPTION_EVENT_GRE_NEEDS_FRAGMENTATION,
-	SFE_IPV4_EXCEPTION_EVENT_ESP_NO_CONNECTION,
-	SFE_IPV4_EXCEPTION_EVENT_ESP_IP_OPTIONS_OR_INITIAL_FRAGMENT,
-	SFE_IPV4_EXCEPTION_EVENT_ESP_NEEDS_FRAGMENTATION,
-	SFE_IPV4_EXCEPTION_EVENT_ESP_SMALL_TTL,
-	SFE_IPV4_EXCEPTION_EVENT_LAST
-};
-
-/*
- * Per-CPU stats
- */
-struct sfe_ipv4_stats {
-	/*
-	 * Stats recorded in a sync period. These stats will be added to
-	 * connection_xxx64 after a sync period.
-	 */
-	u64 connection_create_requests64;
-					/* Number of IPv4 connection create requests */
-	u64 connection_create_collisions64;
-					/* Number of IPv4 connection create requests that collided with existing hash table entries */
-	u64 connection_create_failures64;
-					/* Number of IPv4 connection create requests that failed */
-	u64 connection_destroy_requests64;
-					/* Number of IPv4 connection destroy requests */
-	u64 connection_destroy_misses64;
-					/* Number of IPv4 connection destroy requests that missed our hash table */
-	u64 connection_match_hash_hits64;
-					/* Number of IPv4 connection match hash hits */
-	u64 connection_match_hash_reorders64;
-					/* Number of IPv4 connection match hash reorders */
-	u64 connection_flushes64;	/* Number of IPv4 connection flushes */
-	u64 packets_dropped64;		/* Number of IPv4 packets dropped */
-	u64 packets_forwarded64;	/* Number of IPv4 packets forwarded */
-	u64 packets_fast_xmited64;	/* Number of IPv4 packets fast transmitted */
-	u64 packets_not_forwarded64;	/* Number of IPv4 packets not forwarded */
-	u64 exception_events64[SFE_IPV4_EXCEPTION_EVENT_LAST];
-	u64 pppoe_encap_packets_forwarded64;	/* Number of IPv4 PPPoE encap packets forwarded */
-	u64 pppoe_decap_packets_forwarded64;	/* Number of IPv4 PPPoE decap packets forwarded */
-	u64 pppoe_bridge_packets_forwarded64;	/* Number of IPv4 PPPoE bridge packets forwarded */
-	u64 pppoe_bridge_packets_3tuple_forwarded64;	/* Number of IPv4 PPPoE bridge packets forwarded based on 3-tuple info */
-};
-
-/*
- * sfe_ipv4_per_service_class_stats
- *	Per service class stats
- */
-struct sfe_ipv4_per_service_class_stats {
-	u64 tx_bytes;			/* Byte count */
-	u64 tx_packets;			/* Packet count */
-	seqcount_t seq;			/* seq lock for read/write protection */
-	/*
-	 * TODO: add entries to be collected later.
-	 */
-};
-
-/*
- * sfe_ipv4_service_class_stats_db
- *	Stat entries for each service class.
- */
-struct sfe_ipv4_service_class_stats_db {
-	struct sfe_ipv4_per_service_class_stats psc_stats[SFE_MAX_SERVICE_CLASS_ID];
-					/* Per service class stats */
-};
-
-/*
- * Per-module structure.
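- *
- * A hedged reader-side sketch for the seqcount-protected per-service-class
- * stats above (psc is an illustrative pointer to one psc_stats entry):
- *
- *	unsigned int start;
- *	u64 bytes, packets;
- *
- *	do {
- *		start = read_seqcount_begin(&psc->seq);
- *		bytes = psc->tx_bytes;
- *		packets = psc->tx_packets;
- *	} while (read_seqcount_retry(&psc->seq, start));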
- */ -struct sfe_ipv4 { - spinlock_t lock; /* Lock for SMP correctness */ - struct sfe_ipv4_connection *all_connections_head; - /* Head of the list of all connections */ - struct sfe_ipv4_connection *all_connections_tail; - /* Tail of the list of all connections */ - unsigned int num_connections; /* Number of connections */ - struct delayed_work sync_dwork; /* Work to sync the statistics */ - unsigned int work_cpu; /* The core to run stats sync on */ - - sfe_sync_rule_callback_t __rcu sync_rule_callback; - sfe_ipv4_many_sync_callback_t __rcu many_sync_callback; - /* Callback function registered by a connection manager for stats syncing */ - struct sfe_ipv4_connection *conn_hash[SFE_IPV4_CONNECTION_HASH_SIZE]; - /* Connection hash table */ - - struct hlist_head hlist_conn_match_hash_head[SFE_IPV4_CONNECTION_HASH_SIZE]; - /* Connection match hash table */ - -#ifdef CONFIG_NF_FLOW_COOKIE - struct sfe_flow_cookie_entry sfe_flow_cookie_table[SFE_FLOW_COOKIE_SIZE]; - /* flow cookie table*/ - flow_cookie_set_func_t flow_cookie_set_func; - /* function used to configure flow cookie in hardware*/ - int flow_cookie_enable; - /* Enable/disable flow cookie at runtime */ -#endif - struct sfe_ipv4_service_class_stats_db __percpu *stats_pcpu_psc; - /* Database to maintain per cpu per service class statistics */ - - struct sfe_ipv4_stats __percpu *stats_pcpu; - /* Per CPU statistics. */ - - struct sfe_ipv4_connection *wc_next; /* Connection list walk pointer for stats sync */ - - /* - * Control state. - */ - struct kobject *sys_ipv4; /* sysfs linkage */ - int debug_dev; /* Major number of the debug char device */ - u32 debug_read_seq; /* sequence number for debug dump */ -}; - -/* - * Enumeration of the XML output. - */ -enum sfe_ipv4_debug_xml_states { - SFE_IPV4_DEBUG_XML_STATE_START, - SFE_IPV4_DEBUG_XML_STATE_CONNECTIONS_START, - SFE_IPV4_DEBUG_XML_STATE_CONNECTIONS_CONNECTION, - SFE_IPV4_DEBUG_XML_STATE_CONNECTIONS_END, - SFE_IPV4_DEBUG_XML_STATE_EXCEPTIONS_START, - SFE_IPV4_DEBUG_XML_STATE_EXCEPTIONS_EXCEPTION, - SFE_IPV4_DEBUG_XML_STATE_EXCEPTIONS_END, - SFE_IPV4_DEBUG_XML_STATE_STATS, - SFE_IPV4_DEBUG_XML_STATE_END, - SFE_IPV4_DEBUG_XML_STATE_DONE -}; - -/* - * XML write state. 
- */ -struct sfe_ipv4_debug_xml_write_state { - enum sfe_ipv4_debug_xml_states state; - /* XML output file state machine state */ - int iter_exception; /* Next exception iterator */ -}; - -typedef bool (*sfe_ipv4_debug_xml_write_method_t)(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length, - int *total_read, struct sfe_ipv4_debug_xml_write_state *ws); - -u16 sfe_ipv4_gen_ip_csum(struct iphdr *iph); -bool sfe_ipv4_service_class_stats_get(uint8_t sid, uint64_t *bytes, uint64_t *packets); -void sfe_ipv4_service_class_stats_inc(struct sfe_ipv4 *si, uint8_t sid, uint64_t bytes); -void sfe_ipv4_exception_stats_inc(struct sfe_ipv4 *si, enum sfe_ipv4_exception_events reason); -bool sfe_ipv4_remove_connection(struct sfe_ipv4 *si, struct sfe_ipv4_connection *c); -void sfe_ipv4_flush_connection(struct sfe_ipv4 *si, struct sfe_ipv4_connection *c, sfe_sync_reason_t reason); -void sfe_ipv4_sync_status(struct sfe_ipv4 *si, struct sfe_ipv4_connection *c, sfe_sync_reason_t reason); - -struct sfe_ipv4_connection_match * -sfe_ipv4_find_connection_match_rcu(struct sfe_ipv4 *si, struct net_device *dev, u8 protocol, - __be32 src_ip, __be16 src_port, - __be32 dest_ip, __be16 dest_port); - -void sfe_ipv4_exit(void); -int sfe_ipv4_init(void); - -#endif /* __SFE_IPV4_H */ diff --git a/shortcut-fe/sfe_ipv4_esp.c b/shortcut-fe/sfe_ipv4_esp.c deleted file mode 100644 index f0b49412c..000000000 --- a/shortcut-fe/sfe_ipv4_esp.c +++ /dev/null @@ -1,295 +0,0 @@ -/* - * sfe_ipv4_esp.c - * Shortcut forwarding engine - IPv4 ESP implementation - * - * Copyright (c) 2022 Qualcomm Innovation Center, Inc. All rights reserved. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ - -#include -#include -#include -#include -#include - -#include "sfe_debug.h" -#include "sfe_api.h" -#include "sfe.h" -#include "sfe_flow_cookie.h" -#include "sfe_ipv4.h" -#include "sfe_ipv4_esp.h" - -/* - * sfe_ipv4_recv_esp() - * Handle ESP packet receives and forwarding - */ -int sfe_ipv4_recv_esp(struct sfe_ipv4 *si, struct sk_buff *skb, struct net_device *dev, - unsigned int len, struct iphdr *iph, unsigned int ihl, - bool sync_on_find, bool tun_outer) -{ - struct sfe_ipv4_connection_match *cm; - struct net_device *xmit_dev; - struct net_protocol *ipprot; - netdev_features_t features; - bool passthrough; - bool bridge_flow; - bool fast_xmit; - bool hw_csum; - __be32 src_ip; - __be32 dest_ip; - bool ret; - u8 ttl; - - /* - * Read the IP address from the iphdr, and set the src/dst ports to 0. - */ - src_ip = iph->saddr; - dest_ip = iph->daddr; - rcu_read_lock(); - - /* - * Look for a connection match. 
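- *
- * (Grounded note: ESP carries no transport ports, so the lookup below keys
- *  on the protocol and addresses alone, with both port arguments passed as
- *  zero.)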
- */ -#ifdef CONFIG_NF_FLOW_COOKIE - cm = si->sfe_flow_cookie_table[skb->flow_cookie & SFE_FLOW_COOKIE_MASK].match; - if (unlikely(!cm)) { - cm = sfe_ipv4_find_ipv4_connection_match_rcu(si, dev, IPPROTO_ESP, src_ip, 0, dest_ip, 0); - } -#else - cm = sfe_ipv4_find_connection_match_rcu(si, dev, IPPROTO_ESP, src_ip, 0, dest_ip, 0); -#endif - if (unlikely(!cm)) { - rcu_read_unlock(); - sfe_ipv4_exception_stats_inc(si, SFE_IPV4_EXCEPTION_EVENT_ESP_NO_CONNECTION); - DEBUG_TRACE("no connection found for esp packet\n"); - return 0; - } - - /* - * Source interface validate. - */ - if (unlikely((cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_SRC_INTERFACE_CHECK) && (cm->match_dev != dev))) { - struct sfe_ipv4_connection *c = cm->connection; - int ret; - - spin_lock_bh(&si->lock); - ret = sfe_ipv4_remove_connection(si, c); - spin_unlock_bh(&si->lock); - - if (ret) { - sfe_ipv4_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); - } - rcu_read_unlock(); - sfe_ipv4_exception_stats_inc(si, SFE_IPV4_EXCEPTION_EVENT_INVALID_SRC_IFACE); - DEBUG_TRACE("flush on wrong source interface check failure\n"); - return 0; - } - - passthrough = cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_PASSTHROUGH; - bridge_flow = !!(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_BRIDGE_FLOW); - - /* - * If our packet has been marked as "sync on find" we can't actually - * forward it in the fast path, but now that we've found an associated - * connection we need sync its status before exception it to slow path unless - * it is passthrough (packets not directed to DUT) packet. - * TODO: revisit to ensure that pass through traffic is not bypassing firewall for fragmented cases - */ - if (unlikely(sync_on_find) && !passthrough) { - sfe_ipv4_sync_status(si, cm->connection, SFE_SYNC_REASON_STATS); - rcu_read_unlock(); - sfe_ipv4_exception_stats_inc(si, SFE_IPV4_EXCEPTION_EVENT_ESP_IP_OPTIONS_OR_INITIAL_FRAGMENT); - DEBUG_TRACE("%px: sfe: sync on find\n", cm); - return 0; - } - - /* - * Check if skb was cloned. If it was, unshare it. - */ - if (unlikely(skb_cloned(skb))) { - DEBUG_TRACE("%px: skb is a cloned skb\n", skb); - skb = skb_unshare(skb, GFP_ATOMIC); - if (!skb) { - DEBUG_WARN("Failed to unshare the cloned skb\n"); - rcu_read_unlock(); - return 0; - } - - /* - * Update the iphdr pointer with the unshared skb's data area. - */ - iph = (struct iphdr *)skb->data; - } - - /* - * Enable HW csum if rx checksum is verified and xmit interface is CSUM offload capable. - */ - hw_csum = !!(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_CSUM_OFFLOAD) && (skb->ip_summed == CHECKSUM_UNNECESSARY); - - /* - * proto decap packet. - * Invoke the inet_protocol handler for delivery of the packet. - */ - ipprot = rcu_dereference(cm->proto); - if (likely(ipprot)) { - skb_reset_network_header(skb); - skb_pull(skb, ihl); - skb_reset_transport_header(skb); - xmit_dev = cm->xmit_dev; - skb->dev = xmit_dev; - - ret = ipprot->handler(skb); - if (ret) { - rcu_read_unlock(); - this_cpu_inc(si->stats_pcpu->packets_not_forwarded64); - DEBUG_TRACE("ESP handler returned error %u\n", ret); - return 0; - } - - /* - * Update traffic stats. - */ - atomic_inc(&cm->rx_packet_count); - atomic_add(len, &cm->rx_byte_count); - - rcu_read_unlock(); - this_cpu_inc(si->stats_pcpu->packets_forwarded64); - return 1; - } - - /* - * esp passthrough / ip local out scenarios. - */ - /* - * If our packet is larger than the MTU of the transmit interface then - * we can't forward it easily. 
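- *
- * (Grounded note: SFE never fragments in the fast path; an oversized packet
- *  is counted under ESP_NEEDS_FRAGMENTATION and returned to the normal
- *  Linux stack, which can fragment or signal ICMP as needed.)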
- */
-	if (unlikely(len > cm->xmit_dev_mtu)) {
-		sfe_ipv4_sync_status(si, cm->connection, SFE_SYNC_REASON_STATS);
-		rcu_read_unlock();
-		sfe_ipv4_exception_stats_inc(si, SFE_IPV4_EXCEPTION_EVENT_ESP_NEEDS_FRAGMENTATION);
-		DEBUG_TRACE("%px: sfe: larger than MTU\n", cm);
-		return 0;
-	}
-
-	/*
-	 * Ensure that the TTL is >= 2.
-	 */
-	ttl = iph->ttl;
-	if (!bridge_flow && (ttl < 2) && passthrough) {
-		sfe_ipv4_sync_status(si, cm->connection, SFE_SYNC_REASON_STATS);
-		rcu_read_unlock();
-
-		DEBUG_TRACE("%px: sfe: TTL too low\n", skb);
-		sfe_ipv4_exception_stats_inc(si, SFE_IPV4_EXCEPTION_EVENT_ESP_SMALL_TTL);
-		return 0;
-	}
-
-	/*
-	 * Decrement the TTL by 1, except for bridge flows and tunnel outer headers.
-	 */
-	iph->ttl = (ttl - (u8)(!bridge_flow && !tun_outer));
-
-	/*
-	 * Update DSCP
-	 */
-	if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_DSCP_REMARK)) {
-		iph->tos = (iph->tos & SFE_IPV4_DSCP_MASK) | cm->dscp;
-	}
-
-	/*
-	 * Replace the IP checksum.
-	 */
-	if (likely(hw_csum)) {
-		skb->ip_summed = CHECKSUM_PARTIAL;
-	} else {
-		iph->check = sfe_ipv4_gen_ip_csum(iph);
-	}
-
-	/*
-	 * Update traffic stats.
-	 */
-	atomic_inc(&cm->rx_packet_count);
-	atomic_add(len, &cm->rx_byte_count);
-
-	xmit_dev = cm->xmit_dev;
-	skb->dev = xmit_dev;
-
-	/*
-	 * Write the Layer 2 header.
-	 */
-	if (likely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_WRITE_L2_HDR)) {
-		if (unlikely(!(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_WRITE_FAST_ETH_HDR))) {
-			dev_hard_header(skb, xmit_dev, ETH_P_IP, cm->xmit_dest_mac, cm->xmit_src_mac, len);
-		} else {
-			/*
-			 * For the simple case we write this really fast.
-			 */
-			struct ethhdr *eth = (struct ethhdr *)__skb_push(skb, ETH_HLEN);
-			eth->h_proto = htons(ETH_P_IP);
-			ether_addr_copy((u8 *)eth->h_dest, (u8 *)cm->xmit_dest_mac);
-			ether_addr_copy((u8 *)eth->h_source, (u8 *)cm->xmit_src_mac);
-		}
-	}
-
-	/*
-	 * Update priority of skb
-	 */
-	if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_PRIORITY_REMARK)) {
-		skb->priority = cm->priority;
-	}
-
-	/*
-	 * Mark outgoing packet.
-	 */
-	if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_MARK)) {
-		skb->mark = cm->mark;
-	}
-
-	/*
-	 * For the first packet, check whether it qualifies for fast xmit.
-	 */
-	if (unlikely(!(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_FAST_XMIT_FLOW_CHECKED)
-		     && (cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_FAST_XMIT_DEV_ADMISSION))) {
-		cm->features = netif_skb_features(skb);
-		if (likely(sfe_fast_xmit_check(skb, cm->features))) {
-			cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_FAST_XMIT;
-		}
-		cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_FAST_XMIT_FLOW_CHECKED;
-	}
-
-	features = cm->features;
-	fast_xmit = !!(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_FAST_XMIT);
-
-	rcu_read_unlock();
-	this_cpu_inc(si->stats_pcpu->packets_forwarded64);
-	prefetch(skb_shinfo(skb));
-
-	/*
-	 * We do a per-packet condition check before we can fast xmit the
-	 * packet.
-	 */
-	if (likely(fast_xmit && dev_fast_xmit(skb, xmit_dev, features))) {
-		this_cpu_inc(si->stats_pcpu->packets_fast_xmited64);
-		return 1;
-	}
-
-	/*
-	 * Mark that this packet has been fast forwarded.
-	 */
-	skb->fast_forwarded = 1;
-
-	dev_queue_xmit(skb);
-	return 1;
-}
diff --git a/shortcut-fe/sfe_ipv4_esp.h b/shortcut-fe/sfe_ipv4_esp.h
deleted file mode 100644
index f889605eb..000000000
--- a/shortcut-fe/sfe_ipv4_esp.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/*
- * sfe_ipv4_esp.h
- * Shortcut forwarding engine - IPv4 ESP header file
- *
- * Copyright (c) 2022 Qualcomm Innovation Center, Inc. All rights reserved.
- * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ - -int sfe_ipv4_recv_esp(struct sfe_ipv4 *si, struct sk_buff *skb, struct net_device *dev, unsigned int len, - struct iphdr *iph, unsigned int ihl, bool sync_on_find, bool tun_outer); diff --git a/shortcut-fe/sfe_ipv4_gre.c b/shortcut-fe/sfe_ipv4_gre.c deleted file mode 100644 index ea9a0001f..000000000 --- a/shortcut-fe/sfe_ipv4_gre.c +++ /dev/null @@ -1,390 +0,0 @@ -/* - * sfe_ipv4_gre.c - * Shortcut forwarding engine file for IPv4 GRE - * - * Copyright (c) 2022 Qualcomm Innovation Center, Inc. All rights reserved. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ - -#include -#include -#include -#include -#include -#include - -#include "sfe_debug.h" -#include "sfe_api.h" -#include "sfe.h" -#include "sfe_flow_cookie.h" -#include "sfe_ipv4.h" -#include "sfe_pppoe.h" -#include "sfe_vlan.h" - -/* - * sfe_ipv4_recv_gre() - * GRE tunnel packet receive and forwarding. - */ -int sfe_ipv4_recv_gre(struct sfe_ipv4 *si, struct sk_buff *skb, struct net_device *dev, - unsigned int len, struct iphdr *iph, unsigned int ihl, bool sync_on_find, - struct sfe_l2_info *l2_info, bool tun_outer) -{ - struct sfe_ipv4_connection_match *cm; - struct pptp_gre_header *pptp_hdr; - struct gre_base_hdr *gre_hdr; - struct net_device *xmit_dev; - __be16 dest_port = 0; - bool passthrough; - bool bridge_flow; - __be32 dest_ip; - __be32 src_ip; - bool hw_csum; - bool ret; - u8 ttl; - - /* - * Is our packet too short to contain a valid GRE header? - */ - if (unlikely(!pskb_may_pull(skb, sizeof(*gre_hdr) + ihl))) { - sfe_ipv4_exception_stats_inc(si, SFE_IPV4_EXCEPTION_EVENT_GRE_HEADER_INCOMPLETE); - DEBUG_TRACE("packet too short for GRE header\n"); - return 0; - } - - - /* - * Read the source and destination IP address. 
- */ - src_ip = iph->saddr; - dest_ip = iph->daddr; - - rcu_read_lock(); - - /* - * Look for a connection match with 4 tuple if it is PPTP - */ - gre_hdr = (struct gre_base_hdr *)(skb->data + ihl); - - if ((gre_hdr->protocol == GRE_PROTO_PPP) && likely(pskb_may_pull(skb, (sizeof(*pptp_hdr) - 8) + ihl))) { - pptp_hdr = (struct pptp_gre_header *)(skb->data + ihl); - dest_port = pptp_hdr->call_id; - } - -#ifdef CONFIG_NF_FLOW_COOKIE - cm = si->sfe_flow_cookie_table[skb->flow_cookie & SFE_FLOW_COOKIE_MASK].match; - if (unlikely(!cm)) { - cm = sfe_ipv4_find_connection_match_rcu(si, dev, IPPROTO_GRE, src_ip, 0, dest_ip, dest_port); - } -#else - cm = sfe_ipv4_find_connection_match_rcu(si, dev, IPPROTO_GRE, src_ip, 0, dest_ip, dest_port); -#endif - - if (unlikely(!cm)) { - rcu_read_unlock(); - sfe_ipv4_exception_stats_inc(si, SFE_IPV4_EXCEPTION_EVENT_GRE_NO_CONNECTION); - DEBUG_INFO("no GRE connection match found dev %s src ip %pI4 dest ip %pI4 port %d\n", dev->name, &src_ip, &dest_ip, ntohs(dest_port)); - return 0; - } - - /* - * Source interface validate. - */ - if (unlikely((cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_SRC_INTERFACE_CHECK) && (cm->match_dev != dev))) { - if (!(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_SRC_INTERFACE_CHECK_NO_FLUSH)) { - struct sfe_ipv4_connection *c = cm->connection; - int ret; - - DEBUG_TRACE("flush on source interface check failure\n"); - spin_lock_bh(&si->lock); - ret = sfe_ipv4_remove_connection(si, c); - spin_unlock_bh(&si->lock); - - if (ret) { - sfe_ipv4_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); - } - } - rcu_read_unlock(); - sfe_ipv4_exception_stats_inc(si, SFE_IPV4_EXCEPTION_EVENT_INVALID_SRC_IFACE); - DEBUG_TRACE("exception the packet on source interface check failure\n"); - return 0; - } - - passthrough = cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_PASSTHROUGH; - - /* - * If our packet has been marked as "sync on find" we can't actually - * forward it in the fast path, but now that we've found an associated - * connection we need sync its status before exception it to slow path unless - * it is passthrough (packets not directed to DUT) packet. - * TODO: revisit to ensure that pass through traffic is not bypassing firewall for fragmented cases - */ - if (unlikely(sync_on_find) && !passthrough) { - sfe_ipv4_sync_status(si, cm->connection, SFE_SYNC_REASON_STATS); - rcu_read_unlock(); - sfe_ipv4_exception_stats_inc(si, SFE_IPV4_EXCEPTION_EVENT_GRE_IP_OPTIONS_OR_INITIAL_FRAGMENT); - DEBUG_TRACE("%px: sfe: sync on find\n", cm); - return 0; - } - - /* - * Do we expect an ingress VLAN tag for this flow? - */ - if (unlikely(!sfe_vlan_validate_ingress_tag(skb, cm->ingress_vlan_hdr_cnt, cm->ingress_vlan_hdr, l2_info))) { - rcu_read_unlock(); - sfe_ipv4_exception_stats_inc(si, SFE_IPV4_EXCEPTION_EVENT_INGRESS_VLAN_TAG_MISMATCH); - DEBUG_TRACE("VLAN tag mismatch. skb=%px\n", skb); - return 0; - } - - bridge_flow = !!(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_BRIDGE_FLOW); - - /* - * Does our TTL allow forwarding? - */ - ttl = iph->ttl; - if (!bridge_flow && (ttl < 2) && passthrough) { - sfe_ipv4_sync_status(si, cm->connection, SFE_SYNC_REASON_STATS); - rcu_read_unlock(); - - DEBUG_TRACE("%px: sfe: TTL too low\n", skb); - sfe_ipv4_exception_stats_inc(si, SFE_IPV4_EXCEPTION_EVENT_GRE_SMALL_TTL); - return 0; - } - - /* - * From this point on we're good to modify the packet. - */ - - /* - * Check if skb was cloned. If it was, unshare it. Because - * the data area is going to be written in this path and we don't want to - * change the cloned skb's data section. 
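A condensed sketch of the clone test that follows: a cloned skb shares its data area with a sibling, so rewriting headers in place would corrupt the sibling's view. The fast path therefore takes a private copy only when one is actually needed. The helper name is illustrative, not from the original patch.

#include <linux/skbuff.h>

static struct sk_buff *sfe_skb_make_writable_sketch(struct sk_buff *skb)
{
	if (likely(!skb_cloned(skb)))
		return skb;		/* already private, nothing to do */

	return skb_unshare(skb, GFP_ATOMIC);	/* may return NULL on OOM */
}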
- */ - if (unlikely(skb_cloned(skb))) { - DEBUG_TRACE("%px: skb is a cloned skb\n", skb); - skb = skb_unshare(skb, GFP_ATOMIC); - if (!skb) { - DEBUG_WARN("Failed to unshare the cloned skb\n"); - rcu_read_unlock(); - return 1; - } - - /* - * Update the iph and udph pointers with the unshared skb's data area. - */ - iph = (struct iphdr *)skb->data; - } - - /* - * For PPPoE packets, match server MAC and session id - */ - if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_PPPOE_DECAP)) { - struct ethhdr *eth; - bool pppoe_match; - - if (unlikely(!sfe_l2_parse_flag_check(l2_info, SFE_L2_PARSE_FLAGS_PPPOE_INGRESS))) { - rcu_read_unlock(); - DEBUG_TRACE("%px: PPPoE header not present in packet for PPPoE rule\n", skb); - sfe_ipv4_exception_stats_inc(si, SFE_IPV4_EXCEPTION_EVENT_INCORRECT_PPPOE_PARSING); - return 0; - } - - eth = eth_hdr(skb); - - pppoe_match = (cm->pppoe_session_id == sfe_l2_pppoe_session_id_get(l2_info)) && - ether_addr_equal((u8*)cm->pppoe_remote_mac, (u8 *)eth->h_source); - - if (unlikely(!pppoe_match)) { - DEBUG_TRACE("%px: PPPoE session ID %d and %d or MAC %pM and %pM did not match\n", - skb, cm->pppoe_session_id, sfe_l2_pppoe_session_id_get(l2_info), - cm->pppoe_remote_mac, eth->h_source); - rcu_read_unlock(); - sfe_ipv4_exception_stats_inc(si, SFE_IPV4_EXCEPTION_EVENT_INVALID_PPPOE_SESSION); - return 0; - } - - skb->protocol = htons(l2_info->protocol); - this_cpu_inc(si->stats_pcpu->pppoe_decap_packets_forwarded64); - } else if (unlikely(sfe_l2_parse_flag_check(l2_info, SFE_L2_PARSE_FLAGS_PPPOE_INGRESS))) { - - /* - * If packet contains PPPoE header but CME doesn't contain PPPoE flag yet we are exceptioning - * the packet to linux - */ - if (unlikely(!(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_BRIDGE_FLOW))) { - rcu_read_unlock(); - DEBUG_TRACE("%px: CME doesn't contain PPPoE flag but packet has PPPoE header\n", skb); - sfe_ipv4_exception_stats_inc(si, SFE_IPV4_EXCEPTION_EVENT_PPPOE_NOT_SET_IN_CME); - return 0; - - } - - /* - * For bridged flows when packet contains PPPoE header, restore the header back and forward - * to xmit interface - */ - __skb_push(skb, (sizeof(struct pppoe_hdr) + sizeof(struct sfe_ppp_hdr))); - - this_cpu_inc(si->stats_pcpu->pppoe_bridge_packets_forwarded64); - } - - /* - * protocol handler will be valid only in decap-path. - */ - if (cm->proto) { - struct net_protocol *ipprot = cm->proto; - skb_reset_network_header(skb); - skb_pull(skb, ihl); - skb_reset_transport_header(skb); - skb->fast_forwarded = 1; - - ret = ipprot->handler(skb); - if (ret) { - this_cpu_inc(si->stats_pcpu->packets_not_forwarded64); - rcu_read_unlock(); - DEBUG_TRACE("GRE handler returned error %u\n", ret); - return 1; - } - - /* - * Update traffic stats - */ - atomic_inc(&cm->rx_packet_count); - atomic_add(len, &cm->rx_byte_count); - - this_cpu_inc(si->stats_pcpu->packets_forwarded64); - rcu_read_unlock(); - return 1; - } - - /* - * Check if skb has enough headroom to write L2 headers - */ - if (unlikely(skb_headroom(skb) < cm->l2_hdr_size)) { - rcu_read_unlock(); - DEBUG_WARN("%px: Not enough headroom: %u\n", skb, skb_headroom(skb)); - sfe_ipv4_exception_stats_inc(si, SFE_IPV4_EXCEPTION_EVENT_NO_HEADROOM); - return 0; - } - - /* - * If our packet is larger than the MTU of the transmit interface then - * we can't forward it easily. 
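The PPPoE decap admission test used above reduces to two comparisons: the session ID recorded when the rule was created and the PPPoE server's MAC address must both match the received frame. A minimal sketch, with illustrative parameter names (the rule_* values stand in for the fields cached in the connection match):

#include <linux/etherdevice.h>
#include <linux/if_ether.h>

static bool sfe_pppoe_decap_match_sketch(u16 rule_session_id,
					 const u8 *rule_server_mac,
					 u16 pkt_session_id,
					 const struct ethhdr *eth)
{
	return rule_session_id == pkt_session_id &&
	       ether_addr_equal(rule_server_mac, eth->h_source);
}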
- */ - if (unlikely(len > cm->xmit_dev_mtu)) { - sfe_ipv4_sync_status(si, cm->connection, SFE_SYNC_REASON_STATS); - rcu_read_unlock(); - sfe_ipv4_exception_stats_inc(si, SFE_IPV4_EXCEPTION_EVENT_GRE_NEEDS_FRAGMENTATION); - DEBUG_TRACE("%px: sfe: larger than MTU\n", cm); - return 0; - } - - /* - * Decrement our TTL - */ - iph->ttl = (ttl - (u8)(!bridge_flow && !tun_outer)); - - /* - * Update DSCP - */ - if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_DSCP_REMARK)) { - iph->tos = (iph->tos & SFE_IPV4_DSCP_MASK) | cm->dscp; - } - - /* - * Enable HW csum if rx checksum is verified and xmit interface is CSUM offload capable. - */ - hw_csum = !!(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_CSUM_OFFLOAD) && (skb->ip_summed == CHECKSUM_UNNECESSARY); - - /* - * Replace the IP checksum. - */ - if (likely(hw_csum)) { - skb->ip_summed = CHECKSUM_PARTIAL; - } else { - iph->check = sfe_ipv4_gen_ip_csum(iph); - } - - /* - * Update traffic stats - */ - atomic_inc(&cm->rx_packet_count); - atomic_add(len, &cm->rx_byte_count); - - xmit_dev = cm->xmit_dev; - skb->dev = xmit_dev; - - /* - * For PPPoE flows, add PPPoE header before L2 header is added. - */ - if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_PPPOE_ENCAP)) { - sfe_pppoe_add_header(skb, cm->pppoe_session_id, PPP_IP); - this_cpu_inc(si->stats_pcpu->pppoe_encap_packets_forwarded64); - } - - /* - * Check to see if we need to add VLAN tags - */ - if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_INSERT_EGRESS_VLAN_TAG)) { - sfe_vlan_add_tag(skb, cm->egress_vlan_hdr_cnt, cm->egress_vlan_hdr); - } - - /* - * For the simple case we write this really fast. - */ - if (cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_WRITE_FAST_ETH_HDR) { - struct ethhdr *eth = (struct ethhdr *)__skb_push(skb, ETH_HLEN); - eth->h_proto = skb->protocol; - ether_addr_copy((u8 *)eth->h_dest, (u8 *)cm->xmit_dest_mac); - ether_addr_copy((u8 *)eth->h_source, (u8 *)cm->xmit_src_mac); - } else if (cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_WRITE_L2_HDR) { - dev_hard_header(skb, xmit_dev, ntohs(skb->protocol), cm->xmit_dest_mac, cm->xmit_src_mac, len); - } - - /* - * Update priority of skb. - */ - if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_PRIORITY_REMARK)) { - skb->priority = cm->priority; - } - - /* - * Mark outgoing packet. - */ - if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_MARK)) { - skb->mark = cm->mark; - } - - this_cpu_inc(si->stats_pcpu->packets_forwarded64); - - rcu_read_unlock(); - - /* - * We're going to check for GSO flags when we transmit the packet so - * start fetching the necessary cache line now. - */ - prefetch(skb_shinfo(skb)); - - /* - * Mark that this packet has been fast forwarded. - */ - skb->fast_forwarded = 1; - - /* - * Send the packet on its way. - */ - dev_queue_xmit(skb); - - return 1; -} diff --git a/shortcut-fe/sfe_ipv4_gre.h b/shortcut-fe/sfe_ipv4_gre.h deleted file mode 100644 index b60ce9bac..000000000 --- a/shortcut-fe/sfe_ipv4_gre.h +++ /dev/null @@ -1,21 +0,0 @@ -/* - * sfe_ipv4_gre.h - * Shortcut forwarding engine - IPv4 GRE header file - * - * Copyright (c) 2022 Qualcomm Innovation Center, Inc. All rights reserved. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. 
- * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ -int sfe_ipv4_recv_gre(struct sfe_ipv4 *si, struct sk_buff *skb, struct net_device *dev, - unsigned int len, struct iphdr *iph, unsigned int ihl, bool sync_on_find, - struct sfe_l2_info *l2_info, bool tun_outer); diff --git a/shortcut-fe/sfe_ipv4_icmp.c b/shortcut-fe/sfe_ipv4_icmp.c deleted file mode 100644 index a7d31528f..000000000 --- a/shortcut-fe/sfe_ipv4_icmp.c +++ /dev/null @@ -1,206 +0,0 @@ -/* - * sfe_ipv4_icmp.c - * Shortcut forwarding engine - IPv4 ICMP implementation - * - * Copyright (c) 2013-2016, 2019-2020, The Linux Foundation. All rights reserved. - * Copyright (c) 2021 Qualcomm Innovation Center, Inc. All rights reserved. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ - -#include -#include -#include -#include -#include -#include -#include - -#include "sfe_debug.h" -#include "sfe_api.h" -#include "sfe.h" -#include "sfe_flow_cookie.h" -#include "sfe_ipv4.h" - -/* - * sfe_ipv4_recv_icmp() - * Handle ICMP packet receives. - * - * ICMP packets aren't handled as a "fast path" and always have us process them - * through the default Linux stack. What we do need to do is look for any errors - * about connections we are handling in the fast path. If we find any such - * connections then we want to flush their state so that the ICMP error path - * within Linux has all of the correct state should it need it. - */ -int sfe_ipv4_recv_icmp(struct sfe_ipv4 *si, struct sk_buff *skb, struct net_device *dev, - unsigned int len, struct iphdr *iph, unsigned int ihl) -{ - struct icmphdr *icmph; - struct iphdr *icmp_iph; - unsigned int icmp_ihl_words; - unsigned int icmp_ihl; - u32 *icmp_trans_h; - struct udphdr *icmp_udph; - struct tcphdr *icmp_tcph; - __be32 src_ip; - __be32 dest_ip; - __be16 src_port; - __be16 dest_port; - struct sfe_ipv4_connection_match *cm; - struct sfe_ipv4_connection *c; - u32 pull_len = sizeof(struct icmphdr) + ihl; - bool ret; - - /* - * Is our packet too short to contain a valid ICMP header? - */ - len -= ihl; - if (!pskb_may_pull(skb, pull_len)) { - sfe_ipv4_exception_stats_inc(si, SFE_IPV4_EXCEPTION_EVENT_ICMP_HEADER_INCOMPLETE); - - DEBUG_TRACE("packet too short for ICMP header\n"); - return 0; - } - - /* - * We only handle "destination unreachable" and "time exceeded" messages. 
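The type filter that follows is deliberately narrow: only ICMP errors that can reference an accelerated flow are worth inspecting, everything else is excepted to Linux untouched. A one-function sketch of that test (helper name is illustrative):

#include <linux/icmp.h>

static bool sfe_icmp_is_flush_candidate_sketch(const struct icmphdr *icmph)
{
	/* Only errors carrying an embedded flow are interesting here. */
	return icmph->type == ICMP_DEST_UNREACH ||
	       icmph->type == ICMP_TIME_EXCEEDED;
}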
- */ - icmph = (struct icmphdr *)(skb->data + ihl); - if ((icmph->type != ICMP_DEST_UNREACH) - && (icmph->type != ICMP_TIME_EXCEEDED)) { - - sfe_ipv4_exception_stats_inc(si, SFE_IPV4_EXCEPTION_EVENT_ICMP_UNHANDLED_TYPE); - DEBUG_TRACE("unhandled ICMP type: 0x%x\n", icmph->type); - return 0; - } - - /* - * Do we have the full embedded IP header? - */ - len -= sizeof(struct icmphdr); - pull_len += sizeof(struct iphdr); - if (!pskb_may_pull(skb, pull_len)) { - - sfe_ipv4_exception_stats_inc(si, SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_HEADER_INCOMPLETE); - DEBUG_TRACE("Embedded IP header not complete\n"); - return 0; - } - - /* - * Is our embedded IP version wrong? - */ - icmp_iph = (struct iphdr *)(icmph + 1); - if (unlikely(icmp_iph->version != 4)) { - - sfe_ipv4_exception_stats_inc(si, SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_NON_V4); - DEBUG_TRACE("IP version: %u\n", icmp_iph->version); - return 0; - } - - /* - * Do we have the full embedded IP header, including any options? - */ - icmp_ihl_words = icmp_iph->ihl; - icmp_ihl = icmp_ihl_words << 2; - pull_len += icmp_ihl - sizeof(struct iphdr); - if (!pskb_may_pull(skb, pull_len)) { - - sfe_ipv4_exception_stats_inc(si, SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_IP_OPTIONS_INCOMPLETE); - DEBUG_TRACE("Embedded header not large enough for IP options\n"); - return 0; - } - - len -= icmp_ihl; - icmp_trans_h = ((u32 *)icmp_iph) + icmp_ihl_words; - - /* - * Handle the embedded transport layer header. - */ - switch (icmp_iph->protocol) { - case IPPROTO_UDP: - /* - * We should have 8 bytes of UDP header - that's enough to identify - * the connection. - */ - pull_len += 8; - if (!pskb_may_pull(skb, pull_len)) { - sfe_ipv4_exception_stats_inc(si, SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_UDP_HEADER_INCOMPLETE); - DEBUG_TRACE("Incomplete embedded UDP header\n"); - return 0; - } - - icmp_udph = (struct udphdr *)icmp_trans_h; - src_port = icmp_udph->source; - dest_port = icmp_udph->dest; - break; - - case IPPROTO_TCP: - /* - * We should have 8 bytes of TCP header - that's enough to identify - * the connection. - */ - pull_len += 8; - if (!pskb_may_pull(skb, pull_len)) { - sfe_ipv4_exception_stats_inc(si, SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_TCP_HEADER_INCOMPLETE); - DEBUG_TRACE("Incomplete embedded TCP header\n"); - return 0; - } - - icmp_tcph = (struct tcphdr *)icmp_trans_h; - src_port = icmp_tcph->source; - dest_port = icmp_tcph->dest; - break; - - default: - sfe_ipv4_exception_stats_inc(si, SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_UNHANDLED_PROTOCOL); - DEBUG_TRACE("Unhandled embedded IP protocol: %u\n", icmp_iph->protocol); - return 0; - } - - src_ip = icmp_iph->saddr; - dest_ip = icmp_iph->daddr; - - rcu_read_lock(); - - /* - * Look for a connection match. Note that we reverse the source and destination - * here because our embedded message contains a packet that was sent in the - * opposite direction to the one in which we just received it. It will have - * been sent on the interface from which we received it though so that's still - * ok to use. - */ - cm = sfe_ipv4_find_connection_match_rcu(si, dev, icmp_iph->protocol, dest_ip, dest_port, src_ip, src_port); - if (unlikely(!cm)) { - - rcu_read_unlock(); - sfe_ipv4_exception_stats_inc(si, SFE_IPV4_EXCEPTION_EVENT_ICMP_NO_CONNECTION); - DEBUG_TRACE("no connection found\n"); - return 0; - } - - /* - * We found a connection so now remove it from the connection list and flush - * its state. 
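Worked example of the reversed lookup above: suppose a "time exceeded" error arrives for a flow 192.168.1.10:5000 -> 8.8.8.8:53. The embedded packet travelled in the opposite direction to the ICMP error, so the engine looks up (8.8.8.8, 53) -> (192.168.1.10, 5000) on the same ingress device. A sketch of the tuple reversal, using an illustrative struct rather than the engine's own types:

#include <linux/types.h>

struct sfe_tuple_sketch {
	__be32 saddr, daddr;
	__be16 sport, dport;
};

static struct sfe_tuple_sketch sfe_tuple_reverse_sketch(struct sfe_tuple_sketch t)
{
	struct sfe_tuple_sketch r = {
		.saddr = t.daddr, .daddr = t.saddr,
		.sport = t.dport, .dport = t.sport,
	};
	return r;
}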
- */ - c = cm->connection; - spin_lock_bh(&si->lock); - ret = sfe_ipv4_remove_connection(si, c); - spin_unlock_bh(&si->lock); - - if (ret) { - sfe_ipv4_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); - } - rcu_read_unlock(); - sfe_ipv4_exception_stats_inc(si, SFE_IPV4_EXCEPTION_EVENT_ICMP_FLUSHED_CONNECTION); - return 0; -} diff --git a/shortcut-fe/sfe_ipv4_icmp.h b/shortcut-fe/sfe_ipv4_icmp.h deleted file mode 100644 index df36e58f4..000000000 --- a/shortcut-fe/sfe_ipv4_icmp.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * sfe_ipv4_icmp.h - * Shortcut forwarding engine - IPv4 ICMP header file - * - * Copyright (c) 2013-2016, 2019-2020, The Linux Foundation. All rights reserved. - * Copyright (c) 2021 Qualcomm Innovation Center, Inc. All rights reserved. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ - -int sfe_ipv4_recv_icmp(struct sfe_ipv4 *si, struct sk_buff *skb, struct net_device *dev, - unsigned int len, struct iphdr *iph, unsigned int ihl); diff --git a/shortcut-fe/sfe_ipv4_pppoe_br.c b/shortcut-fe/sfe_ipv4_pppoe_br.c deleted file mode 100644 index 6d12853ee..000000000 --- a/shortcut-fe/sfe_ipv4_pppoe_br.c +++ /dev/null @@ -1,204 +0,0 @@ -/* - * sfe_ipv4_pppoe_br.c - * Shortcut forwarding engine - IPv4 PPPoE bridge implementation - * - * Copyright (c) 2022 Qualcomm Innovation Center, Inc. All rights reserved. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
- */ - -#include -#include -#include -#include - -#include "sfe_debug.h" -#include "sfe_api.h" -#include "sfe.h" -#include "sfe_ipv4.h" -#include "sfe_pppoe.h" -#include "sfe_vlan.h" - -/* - * sfe_ipv4_recv_pppoe_bridge() - * Process PPPoE bridge packets using 3-tuple acceleration - * - */ -int sfe_ipv4_recv_pppoe_bridge(struct sfe_ipv4 *si, struct sk_buff *skb, struct net_device *dev, - unsigned int len, struct iphdr *iph, unsigned int ihl, struct sfe_l2_info *l2_info) -{ - struct sfe_ipv4_connection_match *cm; - u32 service_class_id; - struct net_device *xmit_dev; - int ret; - bool fast_xmit; - netdev_features_t features; - - rcu_read_lock(); - - cm = sfe_ipv4_find_connection_match_rcu(si, dev, IPPROTO_RAW, iph->saddr, 0, iph->daddr, htons(sfe_l2_pppoe_session_id_get(l2_info))); - if (unlikely(!cm)) { - rcu_read_unlock(); - sfe_ipv4_exception_stats_inc(si, SFE_IPV4_EXCEPTION_EVENT_PPPOE_BR_NOT_IN_CME); - DEBUG_TRACE("%px: no connection found in 3-tuple lookup for PPPoE bridge flow\n", skb); - return 0; - } - - /* - * Source interface validate. - */ - if (unlikely((cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_SRC_INTERFACE_CHECK) && (cm->match_dev != dev))) { - if (!(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_SRC_INTERFACE_CHECK_NO_FLUSH)) { - struct sfe_ipv4_connection *c = cm->connection; - DEBUG_TRACE("flush on source interface check failure\n"); - spin_lock_bh(&si->lock); - ret = sfe_ipv4_remove_connection(si, c); - spin_unlock_bh(&si->lock); - - if (ret) { - sfe_ipv4_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); - } - } - rcu_read_unlock(); - sfe_ipv4_exception_stats_inc(si, SFE_IPV4_EXCEPTION_EVENT_INVALID_SRC_IFACE); - DEBUG_TRACE("exception the packet on source interface check failure\n"); - return 0; - } - - /* - * Do we expect an ingress VLAN tag for this flow? - */ - if (unlikely(!sfe_vlan_validate_ingress_tag(skb, cm->ingress_vlan_hdr_cnt, cm->ingress_vlan_hdr, l2_info))) { - rcu_read_unlock(); - sfe_ipv4_exception_stats_inc(si, SFE_IPV4_EXCEPTION_EVENT_INGRESS_VLAN_TAG_MISMATCH); - DEBUG_TRACE("VLAN tag mismatch. skb=%px\n", skb); - return 0; - } - - /* - * Check if skb has enough headroom to write L2 headers - */ - if (unlikely(skb_headroom(skb) < cm->l2_hdr_size)) { - rcu_read_unlock(); - DEBUG_WARN("%px: Not enough headroom: %u\n", skb, skb_headroom(skb)); - sfe_ipv4_exception_stats_inc(si, SFE_IPV4_EXCEPTION_EVENT_NO_HEADROOM); - return 0; - } - - /* - * Restore PPPoE header back - */ - __skb_push(skb, PPPOE_SES_HLEN); - - /* - * Update traffic stats. - */ - atomic_inc(&cm->rx_packet_count); - atomic_add(len, &cm->rx_byte_count); - - xmit_dev = cm->xmit_dev; - skb->dev = xmit_dev; - - /* - * Check to see if we need to add VLAN tags - */ - if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_INSERT_EGRESS_VLAN_TAG)) { - sfe_vlan_add_tag(skb, cm->egress_vlan_hdr_cnt, cm->egress_vlan_hdr); - } - - /* - * Check to see if we need to write an Ethernet header. - */ - if (likely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_WRITE_L2_HDR)) { - if (unlikely(!(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_WRITE_FAST_ETH_HDR))) { - dev_hard_header(skb, xmit_dev, ntohs(skb->protocol), - cm->xmit_dest_mac, cm->xmit_src_mac, len); - } else { - /* - * For the simple case we write this really fast. - */ - struct ethhdr *eth = (struct ethhdr *)__skb_push(skb, ETH_HLEN); - eth->h_proto = skb->protocol; - ether_addr_copy((u8 *)eth->h_dest, (u8 *)cm->xmit_dest_mac); - ether_addr_copy((u8 *)eth->h_source, (u8 *)cm->xmit_src_mac); - } - } - - /* - * Update priority of skb. 
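The 3-tuple lookup earlier in this function keys PPPoE bridge flows on the addresses plus the PPPoE session ID, with the protocol pinned to IPPROTO_RAW because the inner L4 ports are never inspected on this path. A sketch of that key construction; the struct and helper are illustrative assumptions, not the engine's real match key layout:

#include <linux/in.h>
#include <linux/ip.h>

struct sfe_pppoe_br_key_sketch {
	u8     protocol;	/* always IPPROTO_RAW for this path */
	__be32 saddr;
	__be32 daddr;
	__be16 session_id;	/* PPPoE session ID stands in for the L4 port */
};

static struct sfe_pppoe_br_key_sketch
sfe_pppoe_br_key_build_sketch(const struct iphdr *iph, u16 session_id)
{
	struct sfe_pppoe_br_key_sketch key = {
		.protocol   = IPPROTO_RAW,
		.saddr      = iph->saddr,
		.daddr      = iph->daddr,
		.session_id = htons(session_id),
	};
	return key;
}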
- */ - if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_PRIORITY_REMARK)) { - skb->priority = cm->priority; - } - - /* - * Mark outgoing packet. - */ - if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_MARK)) { - skb->mark = cm->mark; - /* - * Update service class stats if SAWF is valid. - */ - if (likely(cm->sawf_valid)) { - service_class_id = SFE_GET_SAWF_SERVICE_CLASS(cm->mark); - sfe_ipv4_service_class_stats_inc(si, service_class_id, len); - } - } - - /* - * For the first packets, check if it could got fast xmit. - */ - if (unlikely(!(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_FAST_XMIT_FLOW_CHECKED) - && (cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_FAST_XMIT_DEV_ADMISSION))){ - cm->features = netif_skb_features(skb); - if (likely(sfe_fast_xmit_check(skb, cm->features))) { - cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_FAST_XMIT; - } - cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_FAST_XMIT_FLOW_CHECKED; - } - features = cm->features; - - fast_xmit = !!(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_FAST_XMIT); - - rcu_read_unlock(); - - this_cpu_inc(si->stats_pcpu->pppoe_bridge_packets_3tuple_forwarded64); - this_cpu_inc(si->stats_pcpu->packets_forwarded64); - - /* - * We're going to check for GSO flags when we transmit the packet so - * start fetching the necessary cache line now. - */ - prefetch(skb_shinfo(skb)); - - /* - * We do per packet condition check before we could fast xmit the - * packet. - */ - if (likely(fast_xmit && dev_fast_xmit(skb, xmit_dev, features))) { - this_cpu_inc(si->stats_pcpu->packets_fast_xmited64); - return 1; - } - - /* - * Mark that this packet has been fast forwarded. - */ - skb->fast_forwarded = 1; - - /* - * Send the packet on its way. - */ - dev_queue_xmit(skb); - - return 1; -} diff --git a/shortcut-fe/sfe_ipv4_pppoe_br.h b/shortcut-fe/sfe_ipv4_pppoe_br.h deleted file mode 100644 index 149149539..000000000 --- a/shortcut-fe/sfe_ipv4_pppoe_br.h +++ /dev/null @@ -1,20 +0,0 @@ -/* - * sfe_ipv4_pppoe_br.h - * Shortcut forwarding engine - IPv4 PPPoE bridge header file - * - * Copyright (c) 2022 Qualcomm Innovation Center, Inc. All rights reserved. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ -int sfe_ipv4_recv_pppoe_bridge(struct sfe_ipv4 *si, struct sk_buff *skb, struct net_device *dev, - unsigned int len, struct iphdr *iph, unsigned int ihl, struct sfe_l2_info *l2_info); diff --git a/shortcut-fe/sfe_ipv4_tcp.c b/shortcut-fe/sfe_ipv4_tcp.c deleted file mode 100644 index 8ed26fdbc..000000000 --- a/shortcut-fe/sfe_ipv4_tcp.c +++ /dev/null @@ -1,765 +0,0 @@ -/* - * sfe_ipv4_tcp.c - * Shortcut forwarding engine - IPv4 TCP implementation - * - * Copyright (c) 2013-2016, 2019-2020, The Linux Foundation. All rights reserved. - * Copyright (c) 2021-2022 Qualcomm Innovation Center, Inc. All rights reserved. 
- * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ - -#include -#include -#include -#include - -#include "sfe_debug.h" -#include "sfe_api.h" -#include "sfe.h" -#include "sfe_flow_cookie.h" -#include "sfe_ipv4.h" -#include "sfe_pppoe.h" -#include "sfe_vlan.h" - -/* - * sfe_ipv4_process_tcp_option_sack() - * Parse TCP SACK option and update ack according - */ -static bool sfe_ipv4_process_tcp_option_sack(const struct tcphdr *th, const u32 data_offs, - u32 *ack) -{ - u32 length = sizeof(struct tcphdr); - u8 *ptr = (u8 *)th + length; - - /* - * Ignore processing if TCP packet has only TIMESTAMP option. - */ - if (likely(data_offs == length + TCPOLEN_TIMESTAMP + 1 + 1) - && likely(ptr[0] == TCPOPT_NOP) - && likely(ptr[1] == TCPOPT_NOP) - && likely(ptr[2] == TCPOPT_TIMESTAMP) - && likely(ptr[3] == TCPOLEN_TIMESTAMP)) { - return true; - } - - /* - * TCP options. Parse SACK option. - */ - while (length < data_offs) { - u8 size; - u8 kind; - - ptr = (u8 *)th + length; - kind = *ptr; - - /* - * NOP, for padding - * Not in the switch because to fast escape and to not calculate size - */ - if (kind == TCPOPT_NOP) { - length++; - continue; - } - - if (kind == TCPOPT_SACK) { - u32 sack = 0; - u8 re = 1 + 1; - - size = *(ptr + 1); - if ((size < (1 + 1 + TCPOLEN_SACK_PERBLOCK)) - || ((size - (1 + 1)) % (TCPOLEN_SACK_PERBLOCK)) - || (size > (data_offs - length))) { - return false; - } - - re += 4; - while (re < size) { - u32 sack_re; - u8 *sptr = ptr + re; - sack_re = (sptr[0] << 24) | (sptr[1] << 16) | (sptr[2] << 8) | sptr[3]; - if (sack_re > sack) { - sack = sack_re; - } - re += TCPOLEN_SACK_PERBLOCK; - } - if (sack > *ack) { - *ack = sack; - } - length += size; - continue; - } - if (kind == TCPOPT_EOL) { - return true; - } - size = *(ptr + 1); - if (size < 2) { - return false; - } - length += size; - } - - return true; -} - -/* - * sfe_ipv4_recv_tcp() - * Handle TCP packet receives and forwarding. - */ -int sfe_ipv4_recv_tcp(struct sfe_ipv4 *si, struct sk_buff *skb, struct net_device *dev, - unsigned int len, struct iphdr *iph, unsigned int ihl, bool sync_on_find, struct sfe_l2_info *l2_info) -{ - struct tcphdr *tcph; - __be32 src_ip; - __be32 dest_ip; - __be16 src_port; - __be16 dest_port; - struct sfe_ipv4_connection_match *cm; - struct sfe_ipv4_connection_match *counter_cm; - u8 ttl; - u32 flags; - u32 service_class_id; - struct net_device *xmit_dev; - bool ret; - bool hw_csum; - bool bridge_flow; - bool fast_xmit; - netdev_features_t features; - - /* - * Is our packet too short to contain a valid TCP header? - */ - if (unlikely(!pskb_may_pull(skb, (sizeof(struct tcphdr) + ihl)))) { - sfe_ipv4_exception_stats_inc(si, SFE_IPV4_EXCEPTION_EVENT_TCP_HEADER_INCOMPLETE); - DEBUG_TRACE("packet too short for TCP header\n"); - return 0; - } - - /* - * Read the IP address and port information. 
Read the IP header data first - * because we've almost certainly got that in the cache. We may not yet have - * the TCP header cached though so allow more time for any prefetching. - */ - src_ip = iph->saddr; - dest_ip = iph->daddr; - - tcph = (struct tcphdr *)(skb->data + ihl); - src_port = tcph->source; - dest_port = tcph->dest; - flags = tcp_flag_word(tcph); - - rcu_read_lock(); - - /* - * Look for a connection match. - */ -#ifdef CONFIG_NF_FLOW_COOKIE - cm = si->sfe_flow_cookie_table[skb->flow_cookie & SFE_FLOW_COOKIE_MASK].match; - if (unlikely(!cm)) { - cm = sfe_ipv4_find_connection_match_rcu(si, dev, IPPROTO_TCP, src_ip, src_port, dest_ip, dest_port); - } -#else - /* - * 5-tuple lookup for TCP flow. - */ - cm = sfe_ipv4_find_connection_match_rcu(si, dev, IPPROTO_TCP, src_ip, src_port, dest_ip, dest_port); -#endif - if (unlikely(!cm)) { - /* - * We didn't get a connection but as TCP is connection-oriented that - * may be because this is a non-fast connection (not running established). - * For diagnostic purposes we differentiate this here. - */ - if (likely((flags & (TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_FIN | TCP_FLAG_ACK)) == TCP_FLAG_ACK)) { - - rcu_read_unlock(); - sfe_ipv4_exception_stats_inc(si, SFE_IPV4_EXCEPTION_EVENT_TCP_NO_CONNECTION_FAST_FLAGS); - DEBUG_TRACE("no connection found - fast flags\n"); - return 0; - } - - rcu_read_unlock(); - sfe_ipv4_exception_stats_inc(si, SFE_IPV4_EXCEPTION_EVENT_TCP_NO_CONNECTION_SLOW_FLAGS); - DEBUG_TRACE("no connection found - slow flags: 0x%x\n", - flags & (TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_FIN | TCP_FLAG_ACK)); - return 0; - } - - /* - * Source interface validate. - */ - if (unlikely((cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_SRC_INTERFACE_CHECK) && (cm->match_dev != dev))) { - if (!(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_SRC_INTERFACE_CHECK_NO_FLUSH)) { - struct sfe_ipv4_connection *c = cm->connection; - DEBUG_TRACE("flush on source interface check failure\n"); - spin_lock_bh(&si->lock); - ret = sfe_ipv4_remove_connection(si, c); - spin_unlock_bh(&si->lock); - - if (ret) { - sfe_ipv4_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); - } - } - rcu_read_unlock(); - sfe_ipv4_exception_stats_inc(si, SFE_IPV4_EXCEPTION_EVENT_INVALID_SRC_IFACE); - DEBUG_TRACE("exception the packet on source interface check failure\n"); - return 0; - } - - /* - * If our packet has been marked as "sync on find" we can't actually - * forward it in the fast path, but now that we've found an associated - * connection we need sync its status before throw it slow path. - */ - if (unlikely(sync_on_find)) { - sfe_ipv4_sync_status(si, cm->connection, SFE_SYNC_REASON_STATS); - rcu_read_unlock(); - - sfe_ipv4_exception_stats_inc(si, SFE_IPV4_EXCEPTION_EVENT_TCP_IP_OPTIONS_OR_INITIAL_FRAGMENT); - DEBUG_TRACE("Sync on find\n"); - return 0; - } - -#ifdef CONFIG_XFRM - /* - * We can't accelerate the flow on this direction, just let it go - * through the slow path. - */ - if (unlikely(!cm->flow_accel)) { - rcu_read_unlock(); - this_cpu_inc(si->stats_pcpu->packets_not_forwarded64); - return 0; - } -#endif - - /* - * Do we expect an ingress VLAN tag for this flow? - */ - if (unlikely(!sfe_vlan_validate_ingress_tag(skb, cm->ingress_vlan_hdr_cnt, cm->ingress_vlan_hdr, l2_info))) { - rcu_read_unlock(); - sfe_ipv4_exception_stats_inc(si, SFE_IPV4_EXCEPTION_EVENT_INGRESS_VLAN_TAG_MISMATCH); - DEBUG_TRACE("VLAN tag mismatch. 
skb=%px\n", skb); - return 0; - } - - bridge_flow = !!(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_BRIDGE_FLOW); - - /* - * Does our TTL allow forwarding? - */ - if (likely(!bridge_flow)) { - ttl = iph->ttl; - if (unlikely(ttl < 2)) { - sfe_ipv4_sync_status(si, cm->connection, SFE_SYNC_REASON_STATS); - rcu_read_unlock(); - - sfe_ipv4_exception_stats_inc(si, SFE_IPV4_EXCEPTION_EVENT_TCP_SMALL_TTL); - DEBUG_TRACE("TTL too low\n"); - return 0; - } - } - - /* - * If our packet is larger than the MTU of the transmit interface then - * we can't forward it easily. - */ - if (unlikely((len > cm->xmit_dev_mtu) && !skb_is_gso(skb))) { - sfe_ipv4_sync_status(si, cm->connection, SFE_SYNC_REASON_STATS); - rcu_read_unlock(); - - sfe_ipv4_exception_stats_inc(si, SFE_IPV4_EXCEPTION_EVENT_TCP_IP_OPTIONS_OR_INITIAL_FRAGMENT); - DEBUG_TRACE("Larger than MTU\n"); - return 0; - } - - /* - * Look at our TCP flags. Anything missing an ACK or that has RST, SYN or FIN - * set is not a fast path packet. - */ - if (unlikely((flags & (TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_FIN | TCP_FLAG_ACK)) != TCP_FLAG_ACK)) { - struct sfe_ipv4_connection *c = cm->connection; - spin_lock_bh(&si->lock); - ret = sfe_ipv4_remove_connection(si, c); - spin_unlock_bh(&si->lock); - - DEBUG_TRACE("TCP flags: %#x are not fast. %u->%u\n", - htonl(flags), htons(src_port), htons(dest_port)); - if (ret) { - sfe_ipv4_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); - } - rcu_read_unlock(); - sfe_ipv4_exception_stats_inc(si, SFE_IPV4_EXCEPTION_EVENT_TCP_FLAGS); - return 0; - } - - counter_cm = cm->counter_match; - - /* - * Are we doing sequence number checking? - */ - if (likely(!(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK))) { - u32 seq; - u32 ack; - u32 sack; - u32 data_offs; - u32 end; - u32 left_edge; - u32 scaled_win; - u32 max_end; - - /* - * Is our sequence fully past the right hand edge of the window? - */ - seq = ntohl(tcph->seq); - if (unlikely((s32)(seq - (cm->protocol_state.tcp.max_end + 1)) > 0)) { - struct sfe_ipv4_connection *c = cm->connection; - spin_lock_bh(&si->lock); - ret = sfe_ipv4_remove_connection(si, c); - spin_unlock_bh(&si->lock); - - DEBUG_TRACE("seq: %u exceeds right edge: %u\n", - seq, cm->protocol_state.tcp.max_end + 1); - if (ret) { - sfe_ipv4_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); - } - rcu_read_unlock(); - sfe_ipv4_exception_stats_inc(si, SFE_IPV4_EXCEPTION_EVENT_TCP_SEQ_EXCEEDS_RIGHT_EDGE); - return 0; - } - - /* - * Check that our TCP data offset isn't too short. - */ - data_offs = tcph->doff << 2; - if (unlikely(data_offs < sizeof(struct tcphdr))) { - struct sfe_ipv4_connection *c = cm->connection; - spin_lock_bh(&si->lock); - ret = sfe_ipv4_remove_connection(si, c); - spin_unlock_bh(&si->lock); - - DEBUG_TRACE("TCP data offset: %u, too small\n", data_offs); - if (ret) { - sfe_ipv4_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); - } - rcu_read_unlock(); - sfe_ipv4_exception_stats_inc(si, SFE_IPV4_EXCEPTION_EVENT_TCP_SMALL_DATA_OFFS); - return 0; - } - - /* - * Update ACK according to any SACK option. 
- */ - ack = ntohl(tcph->ack_seq); - sack = ack; - if (unlikely(!sfe_ipv4_process_tcp_option_sack(tcph, data_offs, &sack))) { - struct sfe_ipv4_connection *c = cm->connection; - spin_lock_bh(&si->lock); - ret = sfe_ipv4_remove_connection(si, c); - spin_unlock_bh(&si->lock); - - DEBUG_TRACE("TCP option SACK size is wrong\n"); - if (ret) { - sfe_ipv4_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); - } - rcu_read_unlock(); - sfe_ipv4_exception_stats_inc(si, SFE_IPV4_EXCEPTION_EVENT_TCP_BAD_SACK); - return 0; - } - - /* - * Check that our TCP data offset isn't past the end of the packet. - */ - data_offs += sizeof(struct iphdr); - if (unlikely(len < data_offs)) { - struct sfe_ipv4_connection *c = cm->connection; - spin_lock_bh(&si->lock); - ret = sfe_ipv4_remove_connection(si, c); - spin_unlock_bh(&si->lock); - - DEBUG_TRACE("TCP data offset: %u, past end of packet: %u\n", - data_offs, len); - if (ret) { - sfe_ipv4_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); - } - rcu_read_unlock(); - sfe_ipv4_exception_stats_inc(si, SFE_IPV4_EXCEPTION_EVENT_TCP_BIG_DATA_OFFS); - return 0; - } - - end = seq + len - data_offs; - - /* - * Is our sequence fully before the left hand edge of the window? - */ - if (unlikely((s32)(end - (cm->protocol_state.tcp.end - - counter_cm->protocol_state.tcp.max_win - 1)) < 0)) { - struct sfe_ipv4_connection *c = cm->connection; - spin_lock_bh(&si->lock); - ret = sfe_ipv4_remove_connection(si, c); - spin_unlock_bh(&si->lock); - - DEBUG_TRACE("seq: %u before left edge: %u\n", - end, cm->protocol_state.tcp.end - counter_cm->protocol_state.tcp.max_win - 1); - if (ret) { - sfe_ipv4_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); - } - rcu_read_unlock(); - sfe_ipv4_exception_stats_inc(si, SFE_IPV4_EXCEPTION_EVENT_TCP_SEQ_BEFORE_LEFT_EDGE); - return 0; - } - - /* - * Are we acking data that is to the right of what has been sent? - */ - if (unlikely((s32)(sack - (counter_cm->protocol_state.tcp.end + 1)) > 0)) { - struct sfe_ipv4_connection *c = cm->connection; - spin_lock_bh(&si->lock); - ret = sfe_ipv4_remove_connection(si, c); - spin_unlock_bh(&si->lock); - - DEBUG_TRACE("ack: %u exceeds right edge: %u\n", - sack, counter_cm->protocol_state.tcp.end + 1); - if (ret) { - sfe_ipv4_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); - } - rcu_read_unlock(); - sfe_ipv4_exception_stats_inc(si, SFE_IPV4_EXCEPTION_EVENT_TCP_ACK_EXCEEDS_RIGHT_EDGE); - return 0; - } - - /* - * Is our ack too far before the left hand edge of the window? - */ - left_edge = counter_cm->protocol_state.tcp.end - - cm->protocol_state.tcp.max_win - - SFE_IPV4_TCP_MAX_ACK_WINDOW - - 1; - if (unlikely((s32)(sack - left_edge) < 0)) { - struct sfe_ipv4_connection *c = cm->connection; - spin_lock_bh(&si->lock); - ret = sfe_ipv4_remove_connection(si, c); - spin_unlock_bh(&si->lock); - - DEBUG_TRACE("ack: %u before left edge: %u\n", sack, left_edge); - if (ret) { - sfe_ipv4_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); - } - rcu_read_unlock(); - sfe_ipv4_exception_stats_inc(si, SFE_IPV4_EXCEPTION_EVENT_TCP_ACK_BEFORE_LEFT_EDGE); - return 0; - } - - /* - * Have we just seen the largest window size yet for this connection? If yes - * then we need to record the new value. - */ - scaled_win = ntohs(tcph->window) << cm->protocol_state.tcp.win_scale; - scaled_win += (sack - ack); - if (unlikely(cm->protocol_state.tcp.max_win < scaled_win)) { - cm->protocol_state.tcp.max_win = scaled_win; - } - - /* - * If our sequence and/or ack numbers have advanced then record the new state. 
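All the window-edge tests above rely on wraparound-safe serial-number arithmetic: instead of comparing two 32-bit sequence values directly, the difference is cast to a signed 32-bit value. A sketch of the idiom (helper name is illustrative):

#include <linux/types.h>

/* True when a is at or after b, modulo 2^32. */
static inline bool sfe_seq_geq_sketch(u32 a, u32 b)
{
	return (s32)(a - b) >= 0;
}

For example, sfe_seq_geq_sketch(0x00000010, 0xfffffff0) is true: the u32 difference is 0x20, so sequence 0x10 is correctly treated as 32 bytes past 0xfffffff0 across the wrap.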
- */ - if (likely((s32)(end - cm->protocol_state.tcp.end) >= 0)) { - cm->protocol_state.tcp.end = end; - } - - max_end = sack + scaled_win; - if (likely((s32)(max_end - counter_cm->protocol_state.tcp.max_end) >= 0)) { - counter_cm->protocol_state.tcp.max_end = max_end; - } - } - - /* - * Check if skb was cloned. If it was, unshare it. Because - * the data area is going to be written in this path and we don't want to - * change the cloned skb's data section. - */ - if (unlikely(skb_cloned(skb))) { - DEBUG_TRACE("%px: skb is a cloned skb\n", skb); - skb = skb_unshare(skb, GFP_ATOMIC); - if (!skb) { - DEBUG_WARN("Failed to unshare the cloned skb\n"); - rcu_read_unlock(); - return 0; - } - - /* - * Update the iph and tcph pointers with the unshared skb's data area. - */ - iph = (struct iphdr *)skb->data; - tcph = (struct tcphdr *)(skb->data + ihl); - } - - /* - * Check if skb has enough headroom to write L2 headers - */ - if (unlikely(skb_headroom(skb) < cm->l2_hdr_size)) { - rcu_read_unlock(); - DEBUG_WARN("%px: Not enough headroom: %u\n", skb, skb_headroom(skb)); - sfe_ipv4_exception_stats_inc(si, SFE_IPV4_EXCEPTION_EVENT_NO_HEADROOM); - return 0; - } - - /* - * For PPPoE packets, match server MAC and session id - */ - if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_PPPOE_DECAP)) { - struct ethhdr *eth; - bool pppoe_match; - - if (unlikely(!sfe_l2_parse_flag_check(l2_info, SFE_L2_PARSE_FLAGS_PPPOE_INGRESS))) { - rcu_read_unlock(); - DEBUG_TRACE("%px: PPPoE header not present in packet for PPPoE rule\n", skb); - sfe_ipv4_exception_stats_inc(si, SFE_IPV4_EXCEPTION_EVENT_INCORRECT_PPPOE_PARSING); - return 0; - } - - eth = eth_hdr(skb); - - pppoe_match = (cm->pppoe_session_id == sfe_l2_pppoe_session_id_get(l2_info)) && - ether_addr_equal((u8*)cm->pppoe_remote_mac, (u8 *)eth->h_source); - - if (unlikely(!pppoe_match)) { - DEBUG_TRACE("%px: PPPoE session ID %d and %d or MAC %pM and %pM did not match\n", - skb, cm->pppoe_session_id, sfe_l2_pppoe_session_id_get(l2_info), - cm->pppoe_remote_mac, eth->h_source); - rcu_read_unlock(); - sfe_ipv4_exception_stats_inc(si, SFE_IPV4_EXCEPTION_EVENT_INVALID_PPPOE_SESSION); - return 0; - } - - skb->protocol = htons(l2_info->protocol); - this_cpu_inc(si->stats_pcpu->pppoe_decap_packets_forwarded64); - } else if (unlikely(sfe_l2_parse_flag_check(l2_info, SFE_L2_PARSE_FLAGS_PPPOE_INGRESS))) { - - /* - * If packet contains PPPoE header but CME doesn't contain PPPoE flag yet we are exceptioning - * the packet to linux - */ - if (unlikely(!bridge_flow)) { - rcu_read_unlock(); - DEBUG_TRACE("%px: CME doesn't contain PPPoE flag but packet has PPPoE header\n", skb); - sfe_ipv4_exception_stats_inc(si, SFE_IPV4_EXCEPTION_EVENT_PPPOE_NOT_SET_IN_CME); - return 0; - } - - /* - * For bridged flows when packet contains PPPoE header, restore the header back and forward - * to xmit interface - */ - __skb_push(skb, PPPOE_SES_HLEN); - this_cpu_inc(si->stats_pcpu->pppoe_bridge_packets_forwarded64); - } - - /* - * From this point on we're good to modify the packet. - */ - - /* - * For PPPoE flows, add PPPoE header before L2 header is added. - */ - if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_PPPOE_ENCAP)) { - sfe_pppoe_add_header(skb, cm->pppoe_session_id, PPP_IP); - this_cpu_inc(si->stats_pcpu->pppoe_encap_packets_forwarded64); - } - - /* - * Update DSCP - */ - if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_DSCP_REMARK)) { - iph->tos = (iph->tos & SFE_IPV4_DSCP_MASK) | cm->dscp; - } - - /* - * Decrement our TTL. 
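The DSCP remark just above rewrites the six DSCP bits of the TOS byte while preserving the two ECN bits. A sketch of the same operation with the masking written out; it assumes SFE_IPV4_DSCP_MASK equals 0x3 and that cm->dscp is stored pre-shifted left by two, which matches how the expression is used but is not stated in this patch:

#include <linux/ip.h>

static void sfe_dscp_remark_sketch(struct iphdr *iph, u8 new_dscp_shifted)
{
	iph->tos = (iph->tos & 0x3) | new_dscp_shifted;	/* 0x3 preserves ECN */
}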
- */ - if (likely(!bridge_flow)) { - iph->ttl = ttl - 1; - } - - /* - * Enable HW csum if rx checksum is verified and xmit interface is CSUM offload capable. - * Note: If L4 csum at Rx was found to be incorrect, we (router) should use incremental L4 checksum here - * so that HW does not re-calculate/replace the L4 csum - */ - hw_csum = !!(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_CSUM_OFFLOAD) && (skb->ip_summed == CHECKSUM_UNNECESSARY); - - /* - * Do we have to perform translations of the source address/port? - */ - if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_SRC)) { - u16 tcp_csum; - u32 sum; - - iph->saddr = cm->xlate_src_ip; - tcph->source = cm->xlate_src_port; - - if (unlikely(!hw_csum)) { - tcp_csum = tcph->check; - if (unlikely(skb->ip_summed == CHECKSUM_PARTIAL)) { - sum = tcp_csum + cm->xlate_src_partial_csum_adjustment; - } else { - sum = tcp_csum + cm->xlate_src_csum_adjustment; - } - - sum = (sum & 0xffff) + (sum >> 16); - tcph->check = (u16)sum; - } - } - - /* - * Do we have to perform translations of the destination address/port? - */ - if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_DEST)) { - u16 tcp_csum; - u32 sum; - - iph->daddr = cm->xlate_dest_ip; - tcph->dest = cm->xlate_dest_port; - - if (unlikely(!hw_csum)) { - tcp_csum = tcph->check; - if (unlikely(skb->ip_summed == CHECKSUM_PARTIAL)) { - sum = tcp_csum + cm->xlate_dest_partial_csum_adjustment; - } else { - sum = tcp_csum + cm->xlate_dest_csum_adjustment; - } - - sum = (sum & 0xffff) + (sum >> 16); - tcph->check = (u16)sum; - } - } - - /* - * If HW checksum offload is not possible, full L3 checksum and incremental L4 checksum - * are used to update the packet. Setting ip_summed to CHECKSUM_UNNECESSARY ensures checksum is - * not recalculated further in packet path. - */ - if (likely(hw_csum)) { - skb->ip_summed = CHECKSUM_PARTIAL; - } else { - iph->check = sfe_ipv4_gen_ip_csum(iph); - skb->ip_summed = CHECKSUM_UNNECESSARY; - } - - /* - * Update traffic stats. - */ - atomic_inc(&cm->rx_packet_count); - atomic_add(len, &cm->rx_byte_count); - - xmit_dev = cm->xmit_dev; - skb->dev = xmit_dev; - - /* - * Check to see if we need to add VLAN tags - */ - if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_INSERT_EGRESS_VLAN_TAG)) { - sfe_vlan_add_tag(skb, cm->egress_vlan_hdr_cnt, cm->egress_vlan_hdr); - } - - /* - * Check to see if we need to write an Ethernet header. - */ - if (likely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_WRITE_L2_HDR)) { - if (unlikely(!(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_WRITE_FAST_ETH_HDR))) { - dev_hard_header(skb, xmit_dev, ntohs(skb->protocol), - cm->xmit_dest_mac, cm->xmit_src_mac, len); - } else { - /* - * For the simple case we write this really fast. - */ - struct ethhdr *eth = (struct ethhdr *)__skb_push(skb, ETH_HLEN); - - eth->h_proto = skb->protocol; - - ether_addr_copy((u8 *)eth->h_dest, (u8 *)cm->xmit_dest_mac); - ether_addr_copy((u8 *)eth->h_source, (u8 *)cm->xmit_src_mac); - } - } - - /* - * Update priority of skb. - */ - if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_PRIORITY_REMARK)) { - skb->priority = cm->priority; - } - - /* - * Mark outgoing packet - */ - if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_MARK)) { - skb->mark = cm->mark; - /* - * Update service class stats if SAWF is valid. 
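The NAT translations above avoid recomputing the L4 checksum from scratch: a per-rule adjustment is added to the old checksum and the carry is folded back in (one's-complement arithmetic). A sketch mirroring that fold; it assumes the adjustment was precomputed at rule-acceleration time and already folded to 16 bits, so a single fold suffices, as in the original:

#include <linux/types.h>

static u16 sfe_l4_csum_adjust_sketch(u16 old_csum, u32 adjustment)
{
	u32 sum = old_csum + adjustment;

	sum = (sum & 0xffff) + (sum >> 16);	/* fold the carry back in */
	return (u16)sum;
}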
- */ - if (likely(cm->sawf_valid)) { - service_class_id = SFE_GET_SAWF_SERVICE_CLASS(cm->mark); - sfe_ipv4_service_class_stats_inc(si, service_class_id, len); - } - } - - /* - * For the first packets, check if it could got fast xmit. - */ - if (unlikely(!(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_FAST_XMIT_FLOW_CHECKED) - && (cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_FAST_XMIT_DEV_ADMISSION))){ - cm->features = netif_skb_features(skb); - if (likely(sfe_fast_xmit_check(skb, cm->features))) { - cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_FAST_XMIT; - } - cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_FAST_XMIT_FLOW_CHECKED; - } - features = cm->features; - fast_xmit = !!(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_FAST_XMIT); - - rcu_read_unlock(); - - this_cpu_inc(si->stats_pcpu->packets_forwarded64); - - /* - * We're going to check for GSO flags when we transmit the packet so - * start fetching the necessary cache line now. - */ - prefetch(skb_shinfo(skb)); - - /* - * We do per packet condition check before we could fast xmit the - * packet. - */ - if (likely(fast_xmit)) { - if (likely(!skb_is_gso(skb))) { - if (likely(dev_fast_xmit(skb, xmit_dev, features))) { - this_cpu_inc(si->stats_pcpu->packets_fast_xmited64); - return 1; - } - } else { - cm->flags &= ~SFE_IPV4_CONNECTION_MATCH_FLAG_FAST_XMIT; - DEBUG_TRACE("%px: fast xmit disabled for xmit dev %s", skb, xmit_dev->name); - } - } - - /* - * Mark that this packet has been fast forwarded. - */ - skb->fast_forwarded = 1; - - /* - * Send the packet on its way. - */ - dev_queue_xmit(skb); - - return 1; -} diff --git a/shortcut-fe/sfe_ipv4_tcp.h b/shortcut-fe/sfe_ipv4_tcp.h deleted file mode 100644 index 19ebb6490..000000000 --- a/shortcut-fe/sfe_ipv4_tcp.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * sfe_ipv4_tcp.h - * Shortcut forwarding engine - IPv4 TCP header file - * - * Copyright (c) 2013-2016, 2019-2020, The Linux Foundation. All rights reserved. - * Copyright (c) 2021-2022 Qualcomm Innovation Center, Inc. All rights reserved. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ - -int sfe_ipv4_recv_tcp(struct sfe_ipv4 *si, struct sk_buff *skb, struct net_device *dev, - unsigned int len, struct iphdr *iph, unsigned int ihl, bool sync_on_find, struct sfe_l2_info *l2_info); diff --git a/shortcut-fe/sfe_ipv4_tun6rd.c b/shortcut-fe/sfe_ipv4_tun6rd.c deleted file mode 100644 index 3f33ee9f2..000000000 --- a/shortcut-fe/sfe_ipv4_tun6rd.c +++ /dev/null @@ -1,235 +0,0 @@ -/* - * sfe_ipv4_tun6rd.c - * Shortcut forwarding engine file for IPv4 TUN6RD - * - * Copyright (c) 2022 Qualcomm Innovation Center, Inc. All rights reserved. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. 
- * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ - -#include -#include -#include -#include -#include - -#include "sfe_debug.h" -#include "sfe_api.h" -#include "sfe.h" -#include "sfe_flow_cookie.h" -#include "sfe_ipv4.h" -#include "sfe_vlan.h" - -/* - * sfe_ipv4_recv_tun6rd() - * Handle TUN6RD packet receives and forwarding. - */ -int sfe_ipv4_recv_tun6rd(struct sfe_ipv4 *si, struct sk_buff *skb, struct net_device *dev, - unsigned int len, struct iphdr *iph, unsigned int ihl, - bool sync_on_find, struct sfe_l2_info *l2_info, bool tun_outer) -{ - __be32 src_ip; - __be32 dest_ip; - __be16 src_port = 0; - __be16 dest_port = 0; - struct sfe_ipv4_connection_match *cm; - - DEBUG_TRACE("%px: sfe: sfe_ipv4_recv_tun6rd called.\n", skb); - - /* - * Read the IP address information. Read the IP header data first - * because we've almost certainly got that in the cache. - */ - src_ip = iph->saddr; - dest_ip = iph->daddr; - - rcu_read_lock(); - - /* - * Look for a connection match. - */ -#ifdef CONFIG_NF_FLOW_COOKIE - cm = si->sfe_flow_cookie_table[skb->flow_cookie & SFE_FLOW_COOKIE_MASK].match; - if (unlikely(!cm)) { - cm = sfe_ipv4_find_connection_match_rcu(si, dev, IPPROTO_IPV6, src_ip, src_port, dest_ip, dest_port); - } -#else - cm = sfe_ipv4_find_connection_match_rcu(si, dev, IPPROTO_IPV6, src_ip, src_port, dest_ip, dest_port); -#endif - if (unlikely(!cm)) { - rcu_read_unlock(); - sfe_ipv4_exception_stats_inc(si, SFE_IPV4_EXCEPTION_EVENT_TUN6RD_NO_CONNECTION); - DEBUG_TRACE("%px: no tun6rd connection found\n", skb); - return 0; - } - - /* - * If our packet has been marked as "sync on find" we will sync the status - * and forward it to slowpath. - */ - if (unlikely(sync_on_find)) { - sfe_ipv4_sync_status(si, cm->connection, SFE_SYNC_REASON_STATS); - rcu_read_unlock(); - sfe_ipv4_exception_stats_inc(si, SFE_IPV4_EXCEPTION_EVENT_TUN6RD_SYNC_ON_FIND); - DEBUG_TRACE("%px: Sync on find\n", skb); - - return 0; - } - - /* - * If cm->proto is set, it means the decap path. - * Otherwise we forward the packet in encap path. - */ - if(cm->proto) { -#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 13, 0)) - const struct net_protocol *ipprot = cm->proto; -#else - struct net_protocol *ipprot = cm->proto; -#endif - - /* - * Do we expect an ingress VLAN tag for this flow? - * Note: We will only have ingress tag check in decap direction. - * Here, no modification is needed, we only check tag match between - * vlan hdr stored in cm and l2_info. - */ - if (unlikely(!sfe_vlan_validate_ingress_tag(skb, cm->ingress_vlan_hdr_cnt, cm->ingress_vlan_hdr, l2_info))) { - rcu_read_unlock(); - sfe_ipv4_exception_stats_inc(si, SFE_IPV4_EXCEPTION_EVENT_INGRESS_VLAN_TAG_MISMATCH); - DEBUG_TRACE("VLAN tag mismatch. 
skb=%px\n" - "cm: %u [0]=%x/%x [1]=%x/%x\n" - "l2_info+: %u [0]=%x/%x [1]=%x/%x\n", skb, - cm->ingress_vlan_hdr_cnt, - htons(cm->ingress_vlan_hdr[0].tpid), cm->ingress_vlan_hdr[0].tci, - htons(cm->ingress_vlan_hdr[1].tpid), cm->ingress_vlan_hdr[1].tci, - l2_info->vlan_hdr_cnt, - htons(l2_info->vlan_hdr[0].tpid), l2_info->vlan_hdr[0].tci, - htons(l2_info->vlan_hdr[1].tpid), l2_info->vlan_hdr[1].tci); - return 0; - } - skb_reset_network_header(skb); - skb_pull(skb, ihl); - skb_reset_transport_header(skb); - - /* - * ipprot->handler(skb) will always return 0; - * There is no way to tell whether the packet is dropped later in linux or not. - * Hence here inc the byte/packet count always. - */ - atomic_inc(&cm->rx_packet_count); - atomic_add(len, &cm->rx_byte_count); - rcu_read_unlock(); - this_cpu_inc(si->stats_pcpu->packets_forwarded64); - DEBUG_TRACE("%px: %s decap done \n", skb, __func__); - - /* - * Update top interface for tunnel searching. - */ - skb->dev = cm->top_interface_dev; - ipprot->handler(skb); - return 1; - - } - - /* - * If our packet is larger than the MTU of the transmit interface then - * we can't forward it easily. - */ - if (unlikely(len > cm->xmit_dev_mtu)) { - sfe_ipv4_sync_status(si, cm->connection, SFE_SYNC_REASON_STATS); - rcu_read_unlock(); - - sfe_ipv4_exception_stats_inc(si, SFE_IPV4_EXCEPTION_EVENT_TUN6RD_NEEDS_FRAGMENTATION); - DEBUG_TRACE("%px: Larger than mtu\n", skb); - return 0; - } - - /* - * Update DSCP - */ - if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_DSCP_REMARK)) { - iph->tos = (iph->tos & SFE_IPV4_DSCP_MASK) | cm->dscp; - } - - /* - * Update traffic stats. - */ - atomic_inc(&cm->rx_packet_count); - atomic_add(len, &cm->rx_byte_count); - - skb->dev = cm->xmit_dev; - - /* - * Check to see if we need to add VLAN tags - */ - if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_INSERT_EGRESS_VLAN_TAG)) { - - /* - * Check if skb has enough headroom to write L2 headers - */ - if (unlikely(skb_headroom(skb) < cm->l2_hdr_size)) { - rcu_read_unlock(); - DEBUG_WARN("%px: Not enough headroom: %u\n", skb, skb_headroom(skb)); - sfe_ipv4_exception_stats_inc(si, SFE_IPV4_EXCEPTION_EVENT_NO_HEADROOM); - return 0; - } - sfe_vlan_add_tag(skb, cm->egress_vlan_hdr_cnt, cm->egress_vlan_hdr); - } - - /* - * Check to see if we need to write a header. - */ - if (likely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_WRITE_L2_HDR)) { - if (unlikely(!(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_WRITE_FAST_ETH_HDR))) { - dev_hard_header(skb, cm->xmit_dev, ntohs(skb->protocol), - cm->xmit_dest_mac, cm->xmit_src_mac, len); - } else { - struct ethhdr *eth = (struct ethhdr *)__skb_push(skb, ETH_HLEN); - eth->h_proto = skb->protocol; - ether_addr_copy((u8 *)eth->h_dest, (u8 *)cm->xmit_dest_mac); - ether_addr_copy((u8 *)eth->h_source, (u8 *)cm->xmit_src_mac); - } - } - - /* - * Update priority of skb. - */ - if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_PRIORITY_REMARK)) { - skb->priority = cm->priority; - } - - /* - * Mark outgoing packet. - */ - if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_MARK)) { - skb->mark = cm->mark; - } - - rcu_read_unlock(); - - this_cpu_inc(si->stats_pcpu->packets_forwarded64); - - /* - * We're going to check for GSO flags when we transmit the packet so - * start fetching the necessary cache line now. - */ - prefetch(skb_shinfo(skb)); - - /* - * Mark that this packet has been fast forwarded and send it on its way. 
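The DSCP remark a few lines above replaces only the six DSCP bits of the ToS byte. Below is a minimal standalone sketch of that rewrite, assuming SFE_IPV4_DSCP_SHIFT is 2 and SFE_IPV4_DSCP_MASK is 0x3 (both values are assumptions here, i.e. the two ECN bits are preserved):

#include <stdint.h>

/* Sketch only: shift/mask values assumed, not quoted from sfe_ipv4.h. */
static inline uint8_t tos_remark_dscp(uint8_t tos, uint8_t dscp)
{
	/* e.g. DSCP 46 (EF): 46 << 2 = 0xb8, ECN bits kept */
	return (uint8_t)((tos & 0x3) | (uint8_t)(dscp << 2));
}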
- */ - skb->fast_forwarded = 1; - dev_queue_xmit(skb); - - return 1; -} diff --git a/shortcut-fe/sfe_ipv4_tun6rd.h b/shortcut-fe/sfe_ipv4_tun6rd.h deleted file mode 100644 index bfa7cc0f5..000000000 --- a/shortcut-fe/sfe_ipv4_tun6rd.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * sfe_ipv4_tun6rd.h - * Shortcut forwarding engine header file for IPv4 TUN6RD - * - * Copyright (c) 2022 Qualcomm Innovation Center, Inc. All rights reserved. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ - -int sfe_ipv4_recv_tun6rd(struct sfe_ipv4 *si, struct sk_buff *skb, struct net_device *dev, - unsigned int len, struct iphdr *iph, unsigned int ihl, - bool sync_on_find, struct sfe_l2_info *l2_info, bool tun_outer); diff --git a/shortcut-fe/sfe_ipv4_udp.c b/shortcut-fe/sfe_ipv4_udp.c deleted file mode 100644 index 47c7dbb5c..000000000 --- a/shortcut-fe/sfe_ipv4_udp.c +++ /dev/null @@ -1,616 +0,0 @@ -/* - * sfe_ipv4_udp.c - * Shortcut forwarding engine - IPv4 UDP implementation - * - * Copyright (c) 2013-2016, 2019-2020, The Linux Foundation. All rights reserved. - * Copyright (c) 2021-2022 Qualcomm Innovation Center, Inc. All rights reserved. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ - -#include -#include -#include -#include -#include - -#include "sfe_debug.h" -#include "sfe_api.h" -#include "sfe.h" -#include "sfe_flow_cookie.h" -#include "sfe_ipv4.h" -#include "sfe_pppoe.h" -#include "sfe_vlan.h" - -/* - * sfe_ipv4_udp_sk_deliver() - * Deliver the packet to the protocol handler registered with Linux. - * To be called under rcu_read_lock() - * Returns: - * 1 if the packet needs to be passed to Linux. - * 0 if the packet is processed successfully. - * -1 if the packet is dropped in SFE. - */ -static int sfe_ipv4_udp_sk_deliver(struct sk_buff *skb, struct sfe_ipv4_connection_match *cm, unsigned int ihl) -{ - struct udp_sock *up; - struct sock *sk; - int ret; - int (*encap_rcv)(struct sock *sk, struct sk_buff *skb); - struct udphdr *uh; - unsigned short ulen; - - /* - * Call the decap handler for valid encap_rcv handler. 
- */
-	up = rcu_dereference(cm->up);
-	encap_rcv = READ_ONCE(up->encap_rcv);
-	if (!encap_rcv) {
-		DEBUG_ERROR("%px: sfe: Error: up->encap_rcv is NULL\n", skb);
-		return 1;
-	}
-
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 13, 0))
-	nf_reset(skb);
-#else
-	nf_reset_ct(skb);
-#endif
-	skb_pull(skb, ihl);
-	skb_reset_transport_header(skb);
-	sk = (struct sock *)up;
-
-	uh = udp_hdr(skb);
-	ulen = ntohs(uh->len);
-	if (ulen < sizeof(*uh) || pskb_trim_rcsum(skb, ulen)) {
-		DEBUG_TRACE("%px: short packet", skb);
-		goto except;
-	}
-	uh = udp_hdr(skb);
-
-	/*
-	 * Verify the checksum before handing the packet to the encap_rcv handler.
-	 * TODO: This approach ignores UDP-lite for now; consider calling the
-	 * Linux API to do the checksum validation instead.
-	 */
-	if (unlikely(skb->ip_summed != CHECKSUM_UNNECESSARY && skb->ip_summed != CHECKSUM_COMPLETE) && uh->check) {
-
-		UDP_SKB_CB(skb)->partial_cov = 0;
-		UDP_SKB_CB(skb)->cscov = skb->len;
-
-		if (skb_checksum_init(skb, IPPROTO_UDP, inet_compute_pseudo)) {
-			DEBUG_TRACE("%px: checksum initialization failed", skb);
-			goto except;
-		}
-
-		if (inet_get_convert_csum(sk)) {
-			skb_checksum_try_convert(skb, IPPROTO_UDP, inet_compute_pseudo);
-		}
-
-		if (udp_lib_checksum_complete(skb)) {
-			DEBUG_TRACE("%px: udp checksum validation failed", skb);
-			goto except;
-		}
-		DEBUG_TRACE("%px: sfe: udp checksum verified in s/w correctly.\n", skb);
-	}
-
-	/*
-	 * At this point the L4 checksum has been verified and the packet is going
-	 * to Linux's tunnel decap handler. The ip_summed field should be set to
-	 * CHECKSUM_NONE so that the packet's inner header checksum is validated
-	 * correctly later.
-	 * TODO: Find the fix to set skb->ip_summed = CHECKSUM_NONE;
-	 */
-
-	/*
-	 * encap_rcv() returns the following values:
-	 *	=0 if the skb was successfully passed to the encap
-	 *	   handler or was discarded by it.
-	 *	>0 if the skb should be passed on to UDP.
-	 *	<0 if the skb should be resubmitted as proto -N.
-	 */
-	ret = encap_rcv(sk, skb);
-	if (unlikely(ret)) {
-		DEBUG_TRACE("%px: sfe: udp-decap API return error: %d\n", skb, ret);
-		goto except;
-	}
-
-	return 0;
-
-except:
-	/*
-	 * For an L2 flow the packet can be restored with its original L2
-	 * information, but the NATed IP addresses in the packet cannot be restored.
-	 */
-	skb_push(skb, ihl);
-	return 1;
-}
-
-/*
- * sfe_ipv4_recv_udp()
- *	Handle UDP packet receives and forwarding.
- */
-int sfe_ipv4_recv_udp(struct sfe_ipv4 *si, struct sk_buff *skb, struct net_device *dev,
-		      unsigned int len, struct iphdr *iph, unsigned int ihl,
-		      bool sync_on_find, struct sfe_l2_info *l2_info, bool tun_outer)
-{
-	struct udphdr *udph;
-	__be32 src_ip;
-	__be32 dest_ip;
-	__be16 src_port;
-	__be16 dest_port;
-	struct sfe_ipv4_connection_match *cm;
-	u8 ttl;
-	u32 service_class_id;
-	struct net_device *xmit_dev;
-	bool hw_csum;
-	int err;
-	bool bridge_flow;
-	int ret;
-	bool fast_xmit;
-	netdev_features_t features;
-
-	/*
-	 * Is our packet too short to contain a valid UDP header?
-	 */
-	if (unlikely(!pskb_may_pull(skb, (sizeof(struct udphdr) + ihl)))) {
-		sfe_ipv4_exception_stats_inc(si, SFE_IPV4_EXCEPTION_EVENT_UDP_HEADER_INCOMPLETE);
-		DEBUG_TRACE("%px: packet too short for UDP header\n", skb);
-		return 0;
-	}
-
-	/*
-	 * Read the IP address and port information. Read the IP header data first
-	 * because we've almost certainly got that in the cache. We may not yet have
-	 * the UDP header cached though so allow more time for any prefetching.
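The encap_rcv() contract quoted above maps onto a handler shaped like the following sketch; my_tunnel_parse() and my_tunnel_decap() are hypothetical placeholders, not real kernel APIs:

/* Hypothetical encap_rcv handler illustrating the return-value contract. */
static int my_encap_rcv(struct sock *sk, struct sk_buff *skb)
{
	if (!my_tunnel_parse(skb)) {
		return 1;		/* >0: not ours, pass the skb on to UDP */
	}

	if (my_tunnel_decap(sk, skb) < 0) {
		kfree_skb(skb);
		return 0;		/* 0: consumed (here, discarded) by the handler */
	}

	return 0;			/* 0: successfully handed to the tunnel */
}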
- */
-	src_ip = iph->saddr;
-	dest_ip = iph->daddr;
-
-	udph = (struct udphdr *)(skb->data + ihl);
-	src_port = udph->source;
-	dest_port = udph->dest;
-
-	rcu_read_lock();
-
-	/*
-	 * Look for a connection match.
-	 */
-#ifdef CONFIG_NF_FLOW_COOKIE
-	cm = si->sfe_flow_cookie_table[skb->flow_cookie & SFE_FLOW_COOKIE_MASK].match;
-	if (unlikely(!cm)) {
-		cm = sfe_ipv4_find_connection_match_rcu(si, dev, IPPROTO_UDP, src_ip, src_port, dest_ip, dest_port);
-	}
-#else
-	/*
-	 * 5-tuple lookup for the UDP flow.
-	 */
-	cm = sfe_ipv4_find_connection_match_rcu(si, dev, IPPROTO_UDP, src_ip, src_port, dest_ip, dest_port);
-#endif
-	if (unlikely(!cm)) {
-
-		/*
-		 * Try a 4-tuple lookup; required for tunnels like VXLAN.
-		 */
-		cm = sfe_ipv4_find_connection_match_rcu(si, dev, IPPROTO_UDP, src_ip, 0, dest_ip, dest_port);
-		if (unlikely(!cm)) {
-			rcu_read_unlock();
-			sfe_ipv4_exception_stats_inc(si, SFE_IPV4_EXCEPTION_EVENT_UDP_NO_CONNECTION);
-			DEBUG_TRACE("%px: sfe: no connection found in 4-tuple lookup.\n", skb);
-			return 0;
-		}
-	}
-
-	/*
-	 * Validate the source interface.
-	 */
-	if (unlikely((cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_SRC_INTERFACE_CHECK) && (cm->match_dev != dev))) {
-		if (!(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_SRC_INTERFACE_CHECK_NO_FLUSH)) {
-			struct sfe_ipv4_connection *c = cm->connection;
-			DEBUG_TRACE("flush on source interface check failure\n");
-			spin_lock_bh(&si->lock);
-			ret = sfe_ipv4_remove_connection(si, c);
-			spin_unlock_bh(&si->lock);
-
-			if (ret) {
-				sfe_ipv4_flush_connection(si, c, SFE_SYNC_REASON_FLUSH);
-			}
-		}
-		rcu_read_unlock();
-		sfe_ipv4_exception_stats_inc(si, SFE_IPV4_EXCEPTION_EVENT_INVALID_SRC_IFACE);
-		DEBUG_TRACE("excepting the packet on source interface check failure\n");
-		return 0;
-	}
-
-	/*
-	 * If our packet has been marked as "sync on find" we can't actually
-	 * forward it in the fast path, but now that we've found an associated
-	 * connection we need to sync its status before sending it to the slow path.
-	 */
-	if (unlikely(sync_on_find)) {
-		sfe_ipv4_sync_status(si, cm->connection, SFE_SYNC_REASON_STATS);
-		rcu_read_unlock();
-		sfe_ipv4_exception_stats_inc(si, SFE_IPV4_EXCEPTION_EVENT_UDP_IP_OPTIONS_OR_INITIAL_FRAGMENT);
-		DEBUG_TRACE("%px: sync on find\n", cm);
-		return 0;
-	}
-
-#ifdef CONFIG_XFRM
-	/*
-	 * We can't accelerate the flow in this direction, so let it go
-	 * through the slow path.
-	 */
-	if (unlikely(!cm->flow_accel)) {
-		rcu_read_unlock();
-		this_cpu_inc(si->stats_pcpu->packets_not_forwarded64);
-		return 0;
-	}
-#endif
-
-	/*
-	 * Do we expect an ingress VLAN tag for this flow?
-	 */
-	if (unlikely(!sfe_vlan_validate_ingress_tag(skb, cm->ingress_vlan_hdr_cnt, cm->ingress_vlan_hdr, l2_info))) {
-		rcu_read_unlock();
-		sfe_ipv4_exception_stats_inc(si, SFE_IPV4_EXCEPTION_EVENT_INGRESS_VLAN_TAG_MISMATCH);
-		DEBUG_TRACE("VLAN tag mismatch. skb=%px\n", skb);
-		return 0;
-	}
-
-	bridge_flow = !!(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_BRIDGE_FLOW);
-
-	/*
-	 * Does our TTL allow forwarding?
-	 */
-	if (likely(!bridge_flow)) {
-		ttl = iph->ttl;
-		if (unlikely(ttl < 2)) {
-			sfe_ipv4_sync_status(si, cm->connection, SFE_SYNC_REASON_STATS);
-			rcu_read_unlock();
-
-			DEBUG_TRACE("%px: sfe: TTL too low\n", skb);
-			sfe_ipv4_exception_stats_inc(si, SFE_IPV4_EXCEPTION_EVENT_UDP_SMALL_TTL);
-			return 0;
-		}
-	}
-
-	/*
-	 * If our packet is larger than the MTU of the transmit interface then
-	 * we can't forward it easily.
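The "ttl < 2" test above encodes a general forwarding rule: the fast path decrements TTL by one and must never emit TTL 0, so a packet arriving with TTL 1 has to be excepted to Linux, which owns generating the ICMP Time Exceeded response. As a standalone predicate (a sketch, not this file's API):

#include <linux/ip.h>

/* True when there is room to decrement the TTL and still be >= 1 on the wire. */
static inline bool ttl_forwardable(const struct iphdr *iph)
{
	return iph->ttl >= 2;
}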
- */
-	if (unlikely((len > cm->xmit_dev_mtu) && (!cm->up))) {
-		sfe_ipv4_sync_status(si, cm->connection, SFE_SYNC_REASON_STATS);
-		rcu_read_unlock();
-		sfe_ipv4_exception_stats_inc(si, SFE_IPV4_EXCEPTION_EVENT_UDP_NEEDS_FRAGMENTATION);
-		DEBUG_TRACE("%px: sfe: larger than MTU\n", cm);
-		return 0;
-	}
-
-	/*
-	 * Check whether the skb is cloned and, if so, unshare it: this path
-	 * writes to the data area and we must not modify a cloned skb's data.
-	 */
-	if (unlikely(skb_cloned(skb))) {
-		DEBUG_TRACE("%px: skb is a cloned skb\n", skb);
-		skb = skb_unshare(skb, GFP_ATOMIC);
-		if (!skb) {
-			DEBUG_WARN("%px: Failed to unshare the cloned skb\n", skb);
-			rcu_read_unlock();
-			return 0;
-		}
-
-		/*
-		 * Update the iph and udph pointers with the unshared skb's data area.
-		 */
-		iph = (struct iphdr *)skb->data;
-		udph = (struct udphdr *)(skb->data + ihl);
-	}
-
-	/*
-	 * Check if the skb has enough headroom to write L2 headers.
-	 */
-	if (unlikely(skb_headroom(skb) < cm->l2_hdr_size)) {
-		rcu_read_unlock();
-		DEBUG_WARN("%px: Not enough headroom: %u\n", skb, skb_headroom(skb));
-		sfe_ipv4_exception_stats_inc(si, SFE_IPV4_EXCEPTION_EVENT_NO_HEADROOM);
-		return 0;
-	}
-
-	/*
-	 * For PPPoE packets, match the server MAC and session ID.
-	 */
-	if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_PPPOE_DECAP)) {
-		struct ethhdr *eth;
-		bool pppoe_match;
-
-		if (unlikely(!sfe_l2_parse_flag_check(l2_info, SFE_L2_PARSE_FLAGS_PPPOE_INGRESS))) {
-			rcu_read_unlock();
-			DEBUG_TRACE("%px: PPPoE header not present in packet for PPPoE rule\n", skb);
-			sfe_ipv4_exception_stats_inc(si, SFE_IPV4_EXCEPTION_EVENT_INCORRECT_PPPOE_PARSING);
-			return 0;
-		}
-
-		eth = eth_hdr(skb);
-
-		pppoe_match = (cm->pppoe_session_id == sfe_l2_pppoe_session_id_get(l2_info)) &&
-			      ether_addr_equal((u8 *)cm->pppoe_remote_mac, (u8 *)eth->h_source);
-
-		if (unlikely(!pppoe_match)) {
-			DEBUG_TRACE("%px: PPPoE session ID %d and %d or MAC %pM and %pM did not match\n",
-				    skb, cm->pppoe_session_id, sfe_l2_pppoe_session_id_get(l2_info),
-				    cm->pppoe_remote_mac, eth->h_source);
-			rcu_read_unlock();
-			sfe_ipv4_exception_stats_inc(si, SFE_IPV4_EXCEPTION_EVENT_INVALID_PPPOE_SESSION);
-			return 0;
-		}
-
-		skb->protocol = htons(l2_info->protocol);
-		this_cpu_inc(si->stats_pcpu->pppoe_decap_packets_forwarded64);
-	} else if (unlikely(sfe_l2_parse_flag_check(l2_info, SFE_L2_PARSE_FLAGS_PPPOE_INGRESS))) {
-
-		/*
-		 * If the packet carries a PPPoE header but the CME has no PPPoE
-		 * flag, exception the packet to Linux.
-		 */
-		if (unlikely(!bridge_flow)) {
-			rcu_read_unlock();
-			DEBUG_TRACE("%px: CME doesn't contain PPPoE flag but packet has PPPoE header\n", skb);
-			sfe_ipv4_exception_stats_inc(si, SFE_IPV4_EXCEPTION_EVENT_PPPOE_NOT_SET_IN_CME);
-			return 0;
-		}
-
-		/*
-		 * For bridged flows whose packets carry a PPPoE header, restore
-		 * the header and forward to the xmit interface.
-		 */
-		__skb_push(skb, PPPOE_SES_HLEN);
-		this_cpu_inc(si->stats_pcpu->pppoe_bridge_packets_forwarded64);
-	}
-
-	/*
-	 * From this point on we're good to modify the packet.
-	 */
-
-	/*
-	 * For PPPoE flows, add the PPPoE header before the L2 header is added.
-	 */
-	if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_PPPOE_ENCAP)) {
-		sfe_pppoe_add_header(skb, cm->pppoe_session_id, PPP_IP);
-		this_cpu_inc(si->stats_pcpu->pppoe_encap_packets_forwarded64);
-	}
-
-	/*
-	 * Enable HW csum if the rx checksum is verified and the xmit interface is CSUM offload capable.
- * Note: If L4 csum at Rx was found to be incorrect, we (router) should use incremental L4 checksum here - * so that HW does not re-calculate/replace the L4 csum - */ - hw_csum = !!(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_CSUM_OFFLOAD) && (skb->ip_summed == CHECKSUM_UNNECESSARY); - - /* - * Do we have to perform translations of the source address/port? - */ - if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_SRC)) { - u16 udp_csum; - - iph->saddr = cm->xlate_src_ip; - udph->source = cm->xlate_src_port; - - /* - * Do we have a non-zero UDP checksum? If we do then we need - * to update it. - */ - if (unlikely(!hw_csum)) { - udp_csum = udph->check; - if (likely(udp_csum)) { - u32 sum; - - if (unlikely(skb->ip_summed == CHECKSUM_PARTIAL)) { - sum = udp_csum + cm->xlate_src_partial_csum_adjustment; - } else { - sum = udp_csum + cm->xlate_src_csum_adjustment; - } - - sum = (sum & 0xffff) + (sum >> 16); - udph->check = (u16)sum; - } - } - } - - /* - * Do we have to perform translations of the destination address/port? - */ - if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_DEST)) { - u16 udp_csum; - - iph->daddr = cm->xlate_dest_ip; - udph->dest = cm->xlate_dest_port; - - /* - * Do we have a non-zero UDP checksum? If we do then we need - * to update it. - */ - if (unlikely(!hw_csum)) { - udp_csum = udph->check; - if (likely(udp_csum)) { - u32 sum; - - /* - * TODO: Use a common API for below incremental checksum calculation - * for IPv4/IPv6 UDP/TCP - */ - if (unlikely(skb->ip_summed == CHECKSUM_PARTIAL)) { - sum = udp_csum + cm->xlate_dest_partial_csum_adjustment; - } else { - sum = udp_csum + cm->xlate_dest_csum_adjustment; - } - - sum = (sum & 0xffff) + (sum >> 16); - udph->check = (u16)sum; - } - } - } - - /* - * UDP sock will be valid only in decap-path. - * Call encap_rcv function associated with udp_sock in cm. - */ - if (unlikely(cm->up)) { - /* - * Call decap handler associated with sock. - * Also validates UDP checksum before calling decap handler. - */ - err = sfe_ipv4_udp_sk_deliver(skb, cm, ihl); - if (unlikely(err == 1)) { - rcu_read_unlock(); - this_cpu_inc(si->stats_pcpu->packets_not_forwarded64); - return 0; - } - - /* - * Update traffic stats. - */ - atomic_inc(&cm->rx_packet_count); - atomic_add(len, &cm->rx_byte_count); - - rcu_read_unlock(); - this_cpu_inc(si->stats_pcpu->packets_forwarded64); - DEBUG_TRACE("%px: sfe: sfe_ipv4_recv_udp -> encap_rcv done.\n", skb); - return 1; - } - - /* - * Decrement our TTL - * Except when called from hook function in post-decap. - */ - if (likely(!bridge_flow)) { - iph->ttl -= (u8)(!tun_outer); - } - - /* - * Update DSCP - */ - if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_DSCP_REMARK)) { - iph->tos = (iph->tos & SFE_IPV4_DSCP_MASK) | cm->dscp; - } - - /* - * If HW checksum offload is not possible, full L3 checksum and incremental L4 checksum - * are used to update the packet. Setting ip_summed to CHECKSUM_UNNECESSARY ensures checksum is - * not recalculated further in packet path. - */ - if (likely(hw_csum)) { - skb->ip_summed = CHECKSUM_PARTIAL; - } else { - iph->check = sfe_ipv4_gen_ip_csum(iph); - } - - /* - * Update traffic stats. 
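The translation blocks above apply a precomputed adjustment and fold the carry exactly once; one fold is enough because the sum of two 16-bit values cannot carry twice. The same update as a standalone helper (a sketch of the technique, not this file's API):

#include <stdint.h>

/* RFC 1624 style incremental update: add the precomputed 16-bit adjustment,
 * then fold the single possible carry back into the low 16 bits. */
static inline uint16_t csum_apply_adjustment(uint16_t csum, uint16_t adjustment)
{
	uint32_t sum = (uint32_t)csum + adjustment;

	return (uint16_t)((sum & 0xffff) + (sum >> 16));
}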
- */
-	atomic_inc(&cm->rx_packet_count);
-	atomic_add(len, &cm->rx_byte_count);
-
-	xmit_dev = cm->xmit_dev;
-	skb->dev = xmit_dev;
-
-	/*
-	 * Check to see if we need to add VLAN tags.
-	 */
-	if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_INSERT_EGRESS_VLAN_TAG)) {
-		sfe_vlan_add_tag(skb, cm->egress_vlan_hdr_cnt, cm->egress_vlan_hdr);
-	}
-
-	/*
-	 * Check to see if we need to write an Ethernet header.
-	 */
-	if (likely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_WRITE_L2_HDR)) {
-		if (unlikely(!(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_WRITE_FAST_ETH_HDR))) {
-			dev_hard_header(skb, xmit_dev, ntohs(skb->protocol),
-					cm->xmit_dest_mac, cm->xmit_src_mac, len);
-		} else {
-			/*
-			 * For the simple case we write this really fast.
-			 */
-			struct ethhdr *eth = (struct ethhdr *)__skb_push(skb, ETH_HLEN);
-			eth->h_proto = skb->protocol;
-			ether_addr_copy((u8 *)eth->h_dest, (u8 *)cm->xmit_dest_mac);
-			ether_addr_copy((u8 *)eth->h_source, (u8 *)cm->xmit_src_mac);
-		}
-	}
-
-	/*
-	 * Update priority of skb.
-	 */
-	if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_PRIORITY_REMARK)) {
-		skb->priority = cm->priority;
-	}
-
-	/*
-	 * Mark outgoing packet.
-	 */
-	if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_MARK)) {
-		skb->mark = cm->mark;
-		/*
-		 * Update service class stats if SAWF is valid.
-		 */
-		if (likely(cm->sawf_valid)) {
-			service_class_id = SFE_GET_SAWF_SERVICE_CLASS(cm->mark);
-			sfe_ipv4_service_class_stats_inc(si, service_class_id, len);
-		}
-	}
-
-	/*
-	 * For the first packets of a flow, check whether they qualify for fast xmit.
-	 */
-	if (unlikely(!(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_FAST_XMIT_FLOW_CHECKED)
-		     && (cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_FAST_XMIT_DEV_ADMISSION))) {
-		cm->features = netif_skb_features(skb);
-		if (likely(sfe_fast_xmit_check(skb, cm->features))) {
-			cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_FAST_XMIT;
-		}
-		cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_FAST_XMIT_FLOW_CHECKED;
-	}
-	features = cm->features;
-
-	fast_xmit = !!(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_FAST_XMIT);
-
-	rcu_read_unlock();
-
-	this_cpu_inc(si->stats_pcpu->packets_forwarded64);
-
-	/*
-	 * We're going to check for GSO flags when we transmit the packet so
-	 * start fetching the necessary cache line now.
-	 */
-	prefetch(skb_shinfo(skb));
-
-	/*
-	 * Perform the per-packet checks that gate fast xmit.
-	 */
-	if (likely(fast_xmit && dev_fast_xmit(skb, xmit_dev, features))) {
-		this_cpu_inc(si->stats_pcpu->packets_fast_xmited64);
-		return 1;
-	}
-
-	/*
-	 * Mark that this packet has been fast forwarded.
-	 */
-	skb->fast_forwarded = 1;
-
-	/*
-	 * Send the packet on its way.
-	 */
-	dev_queue_xmit(skb);
-
-	return 1;
-}
diff --git a/shortcut-fe/sfe_ipv4_udp.h b/shortcut-fe/sfe_ipv4_udp.h
deleted file mode 100644
index c1922d8cb..000000000
--- a/shortcut-fe/sfe_ipv4_udp.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/*
- * sfe_ipv4_udp.h
- *	Shortcut forwarding engine - IPv4 UDP header file
- *
- * Copyright (c) 2013-2016, 2019-2020, The Linux Foundation. All rights reserved.
- * Copyright (c) 2021-2022 Qualcomm Innovation Center, Inc. All rights reserved.
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS.
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ -int sfe_ipv4_recv_udp(struct sfe_ipv4 *si, struct sk_buff *skb, struct net_device *dev, - unsigned int len, struct iphdr *iph, unsigned int ihl, bool sync_on_find, - struct sfe_l2_info *l2_info, bool tun_outer); diff --git a/shortcut-fe/sfe_ipv6.c b/shortcut-fe/sfe_ipv6.c deleted file mode 100644 index 087e5ae61..000000000 --- a/shortcut-fe/sfe_ipv6.c +++ /dev/null @@ -1,2905 +0,0 @@ -/* - * sfe_ipv6.c - * Shortcut forwarding engine - IPv6 support. - * - * Copyright (c) 2015-2016, 2019-2020, The Linux Foundation. All rights reserved. - * Copyright (c) 2021-2022 Qualcomm Innovation Center, Inc. All rights reserved. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "sfe_debug.h" -#include "sfe_api.h" -#include "sfe.h" -#include "sfe_flow_cookie.h" -#include "sfe_ipv6.h" -#include "sfe_ipv6_udp.h" -#include "sfe_ipv6_tcp.h" -#include "sfe_ipv6_icmp.h" -#include "sfe_pppoe.h" -#include "sfe_pppoe_mgr.h" -#include "sfe_ipv6_pppoe_br.h" -#include "sfe_ipv6_tunipip6.h" -#include "sfe_ipv6_gre.h" -#include "sfe_ipv6_esp.h" - -#define sfe_ipv6_addr_copy(src, dest) memcpy((void *)(dest), (void *)(src), 16) - -static char *sfe_ipv6_exception_events_string[SFE_IPV6_EXCEPTION_EVENT_LAST] = { - "UDP_HEADER_INCOMPLETE", - "UDP_NO_CONNECTION", - "UDP_IP_OPTIONS_OR_INITIAL_FRAGMENT", - "UDP_SMALL_TTL", - "UDP_NEEDS_FRAGMENTATION", - "TCP_HEADER_INCOMPLETE", - "TCP_NO_CONNECTION_SLOW_FLAGS", - "TCP_NO_CONNECTION_FAST_FLAGS", - "TCP_IP_OPTIONS_OR_INITIAL_FRAGMENT", - "TCP_SMALL_TTL", - "TCP_NEEDS_FRAGMENTATION", - "TCP_FLAGS", - "TCP_SEQ_EXCEEDS_RIGHT_EDGE", - "TCP_SMALL_DATA_OFFS", - "TCP_BAD_SACK", - "TCP_BIG_DATA_OFFS", - "TCP_SEQ_BEFORE_LEFT_EDGE", - "TCP_ACK_EXCEEDS_RIGHT_EDGE", - "TCP_ACK_BEFORE_LEFT_EDGE", - "ICMP_HEADER_INCOMPLETE", - "ICMP_UNHANDLED_TYPE", - "ICMP_IPV6_HEADER_INCOMPLETE", - "ICMP_IPV6_NON_V6", - "ICMP_IPV6_IP_OPTIONS_INCOMPLETE", - "ICMP_IPV6_UDP_HEADER_INCOMPLETE", - "ICMP_IPV6_TCP_HEADER_INCOMPLETE", - "ICMP_IPV6_UNHANDLED_PROTOCOL", - "ICMP_NO_CONNECTION", - "ICMP_FLUSHED_CONNECTION", - "HEADER_INCOMPLETE", - "BAD_TOTAL_LENGTH", - "NON_V6", - "NON_INITIAL_FRAGMENT", - "DATAGRAM_INCOMPLETE", - "IP_OPTIONS_INCOMPLETE", - "UNHANDLED_PROTOCOL", - "FLOW_COOKIE_ADD_FAIL", - "NO_HEADROOM", - "INVALID_PPPOE_SESSION", - "INCORRECT_PPPOE_PARSING", - "PPPOE_NOT_SET_IN_CME", - 
"PPPOE_BR_NOT_IN_CME", - "INGRESS_VLAN_TAG_MISMATCH", - "INVALID_SOURCE_INTERFACE", - "TUNIPIP6_HEADER_INCOMPLETE", - "TUNIPIP6_NO_CONNECTION", - "TUNIPIP6_IP_OPTIONS_OR_INITIAL_FRAGMENT", - "TUNIPIP6_SMALL_TTL", - "TUNIPIP6_NEEDS_FRAGMENTATION", - "TUNIPIP6_SYNC_ON_FIND", - "GRE_HEADER_INCOMPLETE", - "GRE_NO_CONNECTION", - "GRE_IP_OPTIONS_OR_INITIAL_FRAGMENT", - "GRE_SMALL_TTL", - "GRE_NEEDS_FRAGMENTATION", - "ESP_NO_CONNECTION", - "ESP_IP_OPTIONS_OR_INITIAL_FRAGMENT", - "ESP_NEEDS_FRAGMENTATION", - "ESP_SMALL_TTL" -}; - -static struct sfe_ipv6 __si6; -struct sfe_ipv6_msg *sfe_ipv6_sync_many_msg; -uint32_t sfe_ipv6_sync_max_number; - -/* - * sfe_ipv6_get_debug_dev() - */ -static ssize_t sfe_ipv6_get_debug_dev(struct device *dev, struct device_attribute *attr, char *buf); - -/* - * sysfs attributes. - */ -static const struct device_attribute sfe_ipv6_debug_dev_attr = - __ATTR(debug_dev, S_IWUSR | S_IRUGO, sfe_ipv6_get_debug_dev, NULL); - -/* - * sfe_ipv6_get_connection_match_hash() - * Generate the hash used in connection match lookups. - */ -static inline unsigned int sfe_ipv6_get_connection_match_hash(struct net_device *dev, u8 protocol, - struct sfe_ipv6_addr *src_ip, __be16 src_port, - struct sfe_ipv6_addr *dest_ip, __be16 dest_port) -{ - u32 idx, hash = 0; - - for (idx = 0; idx < 4; idx++) { - hash ^= src_ip->addr[idx] ^ dest_ip->addr[idx]; - } - hash = hash ^ protocol ^ ntohs(src_port ^ dest_port); - return ((hash >> SFE_IPV6_CONNECTION_HASH_SHIFT) ^ hash) & SFE_IPV6_CONNECTION_HASH_MASK; -} - -/* - * sfe_ipv6_find_connection_match_rcu() - * Get the IPv6 flow match info that corresponds to a particular 5-tuple. - */ -struct sfe_ipv6_connection_match * -sfe_ipv6_find_connection_match_rcu(struct sfe_ipv6 *si, struct net_device *dev, u8 protocol, - struct sfe_ipv6_addr *src_ip, __be16 src_port, - struct sfe_ipv6_addr *dest_ip, __be16 dest_port) -{ - struct sfe_ipv6_connection_match *cm = NULL; - unsigned int conn_match_idx; - struct hlist_head *lhead; - WARN_ON_ONCE(!rcu_read_lock_held()); - - conn_match_idx = sfe_ipv6_get_connection_match_hash(dev, protocol, src_ip, src_port, dest_ip, dest_port); - - lhead = &si->hlist_conn_match_hash_head[conn_match_idx]; - - /* - * Hopefully the first entry is the one we want. - */ - hlist_for_each_entry_rcu(cm, lhead, hnode) { - if ((cm->match_dest_port != dest_port) || - (cm->match_src_port != src_port) || - (!sfe_ipv6_addr_equal(cm->match_src_ip, src_ip)) || - (!sfe_ipv6_addr_equal(cm->match_dest_ip, dest_ip)) || - (cm->match_protocol != protocol)) { - continue; - } - - this_cpu_inc(si->stats_pcpu->connection_match_hash_hits64); - - break; - - } - - return cm; -} - -/* - * sfe_ipv6_connection_match_update_summary_stats() - * Update the summary stats for a connection match entry. - */ -static inline void sfe_ipv6_connection_match_update_summary_stats(struct sfe_ipv6_connection_match *cm, - u32 *packets, u32 *bytes) - -{ - u32 packet_count, byte_count; - - packet_count = atomic_read(&cm->rx_packet_count); - cm->rx_packet_count64 += packet_count; - atomic_sub(packet_count, &cm->rx_packet_count); - - byte_count = atomic_read(&cm->rx_byte_count); - cm->rx_byte_count64 += byte_count; - atomic_sub(byte_count, &cm->rx_byte_count); - - *packets = packet_count; - *bytes = byte_count; -} - -/* - * sfe_ipv6_connection_match_compute_translations() - * Compute port and address translations for a connection match entry. 
- */ -static void sfe_ipv6_connection_match_compute_translations(struct sfe_ipv6_connection_match *cm) -{ - u32 diff[9]; - u32 *idx_32; - u16 *idx_16; - - /* - * Before we insert the entry look to see if this is tagged as doing address - * translations. If it is then work out the adjustment that we need to apply - * to the transport checksum. - */ - if (cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_XLATE_SRC) { - u32 adj = 0; - u32 carry = 0; - - /* - * Precompute an incremental checksum adjustment so we can - * edit packets in this stream very quickly. The algorithm is from RFC1624. - */ - idx_32 = diff; - *(idx_32++) = cm->match_src_ip[0].addr[0]; - *(idx_32++) = cm->match_src_ip[0].addr[1]; - *(idx_32++) = cm->match_src_ip[0].addr[2]; - *(idx_32++) = cm->match_src_ip[0].addr[3]; - - idx_16 = (u16 *)idx_32; - *(idx_16++) = cm->match_src_port; - *(idx_16++) = ~cm->xlate_src_port; - idx_32 = (u32 *)idx_16; - - *(idx_32++) = ~cm->xlate_src_ip[0].addr[0]; - *(idx_32++) = ~cm->xlate_src_ip[0].addr[1]; - *(idx_32++) = ~cm->xlate_src_ip[0].addr[2]; - *(idx_32++) = ~cm->xlate_src_ip[0].addr[3]; - - /* - * When we compute this fold it down to a 16-bit offset - * as that way we can avoid having to do a double - * folding of the twos-complement result because the - * addition of 2 16-bit values cannot cause a double - * wrap-around! - */ - for (idx_32 = diff; idx_32 < diff + 9; idx_32++) { - u32 w = *idx_32; - adj += carry; - adj += w; - carry = (w > adj); - } - adj += carry; - adj = (adj & 0xffff) + (adj >> 16); - adj = (adj & 0xffff) + (adj >> 16); - cm->xlate_src_csum_adjustment = (u16)adj; - } - - if (cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_XLATE_DEST) { - u32 adj = 0; - u32 carry = 0; - - /* - * Precompute an incremental checksum adjustment so we can - * edit packets in this stream very quickly. The algorithm is from RFC1624. - */ - idx_32 = diff; - *(idx_32++) = cm->match_dest_ip[0].addr[0]; - *(idx_32++) = cm->match_dest_ip[0].addr[1]; - *(idx_32++) = cm->match_dest_ip[0].addr[2]; - *(idx_32++) = cm->match_dest_ip[0].addr[3]; - - idx_16 = (u16 *)idx_32; - *(idx_16++) = cm->match_dest_port; - *(idx_16++) = ~cm->xlate_dest_port; - idx_32 = (u32 *)idx_16; - - *(idx_32++) = ~cm->xlate_dest_ip[0].addr[0]; - *(idx_32++) = ~cm->xlate_dest_ip[0].addr[1]; - *(idx_32++) = ~cm->xlate_dest_ip[0].addr[2]; - *(idx_32++) = ~cm->xlate_dest_ip[0].addr[3]; - - /* - * When we compute this fold it down to a 16-bit offset - * as that way we can avoid having to do a double - * folding of the twos-complement result because the - * addition of 2 16-bit values cannot cause a double - * wrap-around! - */ - for (idx_32 = diff; idx_32 < diff + 9; idx_32++) { - u32 w = *idx_32; - adj += carry; - adj += w; - carry = (w > adj); - } - adj += carry; - adj = (adj & 0xffff) + (adj >> 16); - adj = (adj & 0xffff) + (adj >> 16); - cm->xlate_dest_csum_adjustment = (u16)adj; - } -} - -/* - * sfe_ipv6_update_summary_stats() - * Update the summary stats. 
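The nine-word computation above batches the old fields (as-is) and the new fields (complemented) into one accumulation. For a single 16-bit field, RFC 1624's equation 3, HC' = ~(~HC + ~m + m'), produces the same result and makes a handy cross-check (a sketch of the technique only):

#include <stdint.h>

/* RFC 1624 Eqn. 3 for one 16-bit field; two folds because three addends can
 * carry twice. */
static inline uint16_t csum_update_field(uint16_t csum, uint16_t old16, uint16_t new16)
{
	uint32_t sum = (uint16_t)~csum + (uint16_t)~old16 + new16;

	sum = (sum & 0xffff) + (sum >> 16);
	sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}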
- */ -static void sfe_ipv6_update_summary_stats(struct sfe_ipv6 *si, struct sfe_ipv6_stats *stats) -{ - int i = 0; - - memset(stats, 0, sizeof(*stats)); - - for_each_possible_cpu(i) { - const struct sfe_ipv6_stats *s = per_cpu_ptr(si->stats_pcpu, i); - - stats->connection_create_requests64 += s->connection_create_requests64; - stats->connection_create_collisions64 += s->connection_create_collisions64; - stats->connection_create_failures64 += s->connection_create_failures64; - stats->connection_destroy_requests64 += s->connection_destroy_requests64; - stats->connection_destroy_misses64 += s->connection_destroy_misses64; - stats->connection_match_hash_hits64 += s->connection_match_hash_hits64; - stats->connection_match_hash_reorders64 += s->connection_match_hash_reorders64; - stats->connection_flushes64 += s->connection_flushes64; - stats->packets_dropped64 += s->packets_dropped64; - stats->packets_forwarded64 += s->packets_forwarded64; - stats->packets_fast_xmited64 += s->packets_fast_xmited64; - stats->packets_not_forwarded64 += s->packets_not_forwarded64; - stats->pppoe_encap_packets_forwarded64 += s->pppoe_encap_packets_forwarded64; - stats->pppoe_decap_packets_forwarded64 += s->pppoe_decap_packets_forwarded64; - stats->pppoe_bridge_packets_forwarded64 += s->pppoe_bridge_packets_forwarded64; - stats->pppoe_bridge_packets_3tuple_forwarded64 += s->pppoe_bridge_packets_3tuple_forwarded64; - } -} - -/* - * sfe_ipv6_insert_connection_match() - * Insert a connection match into the hash. - * - * On entry we must be holding the lock that protects the hash table. - */ -static inline void sfe_ipv6_insert_connection_match(struct sfe_ipv6 *si, - struct sfe_ipv6_connection_match *cm) -{ - unsigned int conn_match_idx - = sfe_ipv6_get_connection_match_hash(cm->match_dev, cm->match_protocol, - cm->match_src_ip, cm->match_src_port, - cm->match_dest_ip, cm->match_dest_port); - - lockdep_assert_held(&si->lock); - - hlist_add_head_rcu(&cm->hnode, &si->hlist_conn_match_hash_head[conn_match_idx]); -#ifdef CONFIG_NF_FLOW_COOKIE - if (!si->flow_cookie_enable || !(cm->flags & (SFE_IPV6_CONNECTION_MATCH_FLAG_XLATE_SRC | SFE_IPV6_CONNECTION_MATCH_FLAG_XLATE_DEST))) - return; - - /* - * Configure hardware to put a flow cookie in packet of this flow, - * then we can accelerate the lookup process when we received this packet. - */ - for (conn_match_idx = 1; conn_match_idx < SFE_FLOW_COOKIE_SIZE; conn_match_idx++) { - struct sfe_ipv6_flow_cookie_entry *entry = &si->sfe_flow_cookie_table[conn_match_idx]; - - if ((NULL == entry->match) && time_is_before_jiffies(entry->last_clean_time + HZ)) { - sfe_ipv6_flow_cookie_set_func_t func; - - rcu_read_lock(); - func = rcu_dereference(si->flow_cookie_set_func); - if (func) { - if (!func(cm->match_protocol, cm->match_src_ip->addr, cm->match_src_port, - cm->match_dest_ip->addr, cm->match_dest_port, conn_match_idx)) { - entry->match = cm; - cm->flow_cookie = conn_match_idx; - } else { - si->exception_events[SFE_IPV6_EXCEPTION_EVENT_FLOW_COOKIE_ADD_FAIL]++; - } - } - rcu_read_unlock(); - - break; - } - } -#endif -} - -/* - * sfe_ipv6_remove_connection_match() - * Remove a connection match object from the hash. 
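The insert path above relies on a simple division of labour: all hash mutations happen under si->lock using the _rcu list ops, while the per-packet lookup shown earlier walks the same buckets under rcu_read_lock() alone. A compressed sketch of the writer side, using the names from this file:

static void example_insert(struct sfe_ipv6 *si, struct sfe_ipv6_connection_match *cm,
			   unsigned int idx)
{
	spin_lock_bh(&si->lock);	/* serializes against other writers only */
	hlist_add_head_rcu(&cm->hnode, &si->hlist_conn_match_hash_head[idx]);
	spin_unlock_bh(&si->lock);	/* readers never take this lock */
}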
- */ -static inline void sfe_ipv6_remove_connection_match(struct sfe_ipv6 *si, struct sfe_ipv6_connection_match *cm) -{ - - lockdep_assert_held(&si->lock); -#ifdef CONFIG_NF_FLOW_COOKIE - if (si->flow_cookie_enable) { - /* - * Tell hardware that we no longer need a flow cookie in packet of this flow - */ - unsigned int conn_match_idx; - - for (conn_match_idx = 1; conn_match_idx < SFE_FLOW_COOKIE_SIZE; conn_match_idx++) { - struct sfe_ipv6_flow_cookie_entry *entry = &si->sfe_flow_cookie_table[conn_match_idx]; - - if (cm == entry->match) { - sfe_ipv6_flow_cookie_set_func_t func; - - rcu_read_lock(); - func = rcu_dereference(si->flow_cookie_set_func); - if (func) { - func(cm->match_protocol, cm->match_src_ip->addr, cm->match_src_port, - cm->match_dest_ip->addr, cm->match_dest_port, 0); - } - rcu_read_unlock(); - - cm->flow_cookie = 0; - entry->match = NULL; - entry->last_clean_time = jiffies; - break; - } - } - } -#endif - hlist_del_init_rcu(&cm->hnode); - -} - -/* - * sfe_ipv6_get_connection_hash() - * Generate the hash used in connection lookups. - */ -static inline unsigned int sfe_ipv6_get_connection_hash(u8 protocol, struct sfe_ipv6_addr *src_ip, __be16 src_port, - struct sfe_ipv6_addr *dest_ip, __be16 dest_port) -{ - u32 idx, hash = 0; - - for (idx = 0; idx < 4; idx++) { - hash ^= src_ip->addr[idx] ^ dest_ip->addr[idx]; - } - hash = hash ^ protocol ^ ntohs(src_port) ^ dest_port; - return ((hash >> SFE_IPV6_CONNECTION_HASH_SHIFT) ^ hash) & SFE_IPV6_CONNECTION_HASH_MASK; -} - -/* - * sfe_ipv6_find_connection() - * Get the IPv6 connection info that corresponds to a particular 5-tuple. - * - * On entry we must be holding the lock that protects the hash table. - */ -static inline struct sfe_ipv6_connection *sfe_ipv6_find_connection(struct sfe_ipv6 *si, u32 protocol, - struct sfe_ipv6_addr *src_ip, __be16 src_port, - struct sfe_ipv6_addr *dest_ip, __be16 dest_port) -{ - struct sfe_ipv6_connection *c; - - unsigned int conn_idx = sfe_ipv6_get_connection_hash(protocol, src_ip, src_port, dest_ip, dest_port); - - lockdep_assert_held(&si->lock); - c = si->conn_hash[conn_idx]; - - while (c) { - if ((c->src_port == src_port) - && (c->dest_port == dest_port) - && (sfe_ipv6_addr_equal(c->src_ip, src_ip)) - && (sfe_ipv6_addr_equal(c->dest_ip, dest_ip)) - && (c->protocol == protocol)) { - return c; - } - c = c->next; - } - - return NULL; -} - -/* - * sfe_ipv6_insert_connection() - * Insert a connection into the hash. - * - * On entry we must be holding the lock that protects the hash table. - */ -static void sfe_ipv6_insert_connection(struct sfe_ipv6 *si, struct sfe_ipv6_connection *c) -{ - struct sfe_ipv6_connection **hash_head; - struct sfe_ipv6_connection *prev_head; - unsigned int conn_idx; - - lockdep_assert_held(&si->lock); - - /* - * Insert entry into the connection hash. - */ - conn_idx = sfe_ipv6_get_connection_hash(c->protocol, c->src_ip, c->src_port, - c->dest_ip, c->dest_port); - hash_head = &si->conn_hash[conn_idx]; - prev_head = *hash_head; - c->prev = NULL; - if (prev_head) { - prev_head->prev = c; - } - - c->next = prev_head; - *hash_head = c; - - /* - * Insert entry into the "all connections" list. - */ - if (si->all_connections_tail) { - c->all_connections_prev = si->all_connections_tail; - si->all_connections_tail->all_connections_next = c; - } else { - c->all_connections_prev = NULL; - si->all_connections_head = c; - } - - si->all_connections_tail = c; - c->all_connections_next = NULL; - si->num_connections++; - - /* - * Insert the connection match objects too. 
- */ - sfe_ipv6_insert_connection_match(si, c->original_match); - sfe_ipv6_insert_connection_match(si, c->reply_match); -} - -/* - * sfe_ipv6_remove_connection() - * Remove a sfe_ipv6_connection object from the hash. - * - * On entry we must be holding the lock that protects the hash table. - */ -bool sfe_ipv6_remove_connection(struct sfe_ipv6 *si, struct sfe_ipv6_connection *c) -{ - - lockdep_assert_held(&si->lock); - if (c->removed) { - DEBUG_ERROR("%px: Connection has been removed already\n", c); - return false; - } - - /* - * dereference the decap direction top_interface_dev - */ - if (c->reply_match->top_interface_dev) { - dev_put(c->reply_match->top_interface_dev); - } - /* - * Remove the connection match objects. - */ - sfe_ipv6_remove_connection_match(si, c->reply_match); - sfe_ipv6_remove_connection_match(si, c->original_match); - - /* - * Unlink the connection. - */ - if (c->prev) { - c->prev->next = c->next; - } else { - unsigned int conn_idx = sfe_ipv6_get_connection_hash(c->protocol, c->src_ip, c->src_port, - c->dest_ip, c->dest_port); - si->conn_hash[conn_idx] = c->next; - } - - if (c->next) { - c->next->prev = c->prev; - } - - /* - * Unlink connection from all_connections list - */ - if (c->all_connections_prev) { - c->all_connections_prev->all_connections_next = c->all_connections_next; - } else { - si->all_connections_head = c->all_connections_next; - } - - if (c->all_connections_next) { - c->all_connections_next->all_connections_prev = c->all_connections_prev; - } else { - si->all_connections_tail = c->all_connections_prev; - } - - /* - * If I am the next sync connection, move the sync to my next or head. - */ - if (unlikely(si->wc_next == c)) { - si->wc_next = c->all_connections_next; - } - - c->removed = true; - si->num_connections--; - return true; -} - -/* - * sfe_ipv6_gen_sync_connection() - * Sync a connection. - * - * On entry to this function we expect that the lock for the connection is either - * already held (while called from sfe_ipv6_periodic_sync() or isn't required - * (while called from sfe_ipv6_flush_sfe_ipv6_connection()) - */ -static void sfe_ipv6_gen_sync_connection(struct sfe_ipv6 *si, struct sfe_ipv6_connection *c, - struct sfe_connection_sync *sis, sfe_sync_reason_t reason, - u64 now_jiffies) -{ - struct sfe_ipv6_connection_match *original_cm; - struct sfe_ipv6_connection_match *reply_cm; - u32 packet_count, byte_count; - - /* - * Fill in the update message. 
- */
-	sis->is_v6 = 1;
-	sis->protocol = c->protocol;
-	sis->src_ip.ip6[0] = c->src_ip[0];
-	sis->src_ip_xlate.ip6[0] = c->src_ip_xlate[0];
-	sis->dest_ip.ip6[0] = c->dest_ip[0];
-	sis->dest_ip_xlate.ip6[0] = c->dest_ip_xlate[0];
-	sis->src_port = c->src_port;
-	sis->src_port_xlate = c->src_port_xlate;
-	sis->dest_port = c->dest_port;
-	sis->dest_port_xlate = c->dest_port_xlate;
-
-	original_cm = c->original_match;
-	reply_cm = c->reply_match;
-	sis->src_td_max_window = original_cm->protocol_state.tcp.max_win;
-	sis->src_td_end = original_cm->protocol_state.tcp.end;
-	sis->src_td_max_end = original_cm->protocol_state.tcp.max_end;
-	sis->dest_td_max_window = reply_cm->protocol_state.tcp.max_win;
-	sis->dest_td_end = reply_cm->protocol_state.tcp.end;
-	sis->dest_td_max_end = reply_cm->protocol_state.tcp.max_end;
-
-	sfe_ipv6_connection_match_update_summary_stats(original_cm, &packet_count, &byte_count);
-	sis->src_new_packet_count = packet_count;
-	sis->src_new_byte_count = byte_count;
-
-	sfe_ipv6_connection_match_update_summary_stats(reply_cm, &packet_count, &byte_count);
-	sis->dest_new_packet_count = packet_count;
-	sis->dest_new_byte_count = byte_count;
-
-	sis->src_dev = original_cm->match_dev;
-	sis->src_packet_count = original_cm->rx_packet_count64;
-	sis->src_byte_count = original_cm->rx_byte_count64;
-
-	sis->dest_dev = reply_cm->match_dev;
-	sis->dest_packet_count = reply_cm->rx_packet_count64;
-	sis->dest_byte_count = reply_cm->rx_byte_count64;
-
-	sis->reason = reason;
-
-	/*
-	 * Get the time increment since our last sync.
-	 */
-	sis->delta_jiffies = now_jiffies - c->last_sync_jiffies;
-	c->last_sync_jiffies = now_jiffies;
-}
-
-/*
- * sfe_ipv6_free_sfe_ipv6_connection_rcu()
- *	Called at an RCU quiescent state to free the connection object.
- */
-static void sfe_ipv6_free_sfe_ipv6_connection_rcu(struct rcu_head *head)
-{
-	struct sfe_ipv6_connection *c;
-	struct udp_sock *up;
-	struct sock *sk;
-
-	/*
-	 * We don't need the spin lock because the connection has already been
-	 * removed from the linked list.
-	 */
-	c = container_of(head, struct sfe_ipv6_connection, rcu);
-	BUG_ON(!c->removed);
-
-	DEBUG_TRACE("%px: connection has been deleted\n", c);
-
-	/*
-	 * Drop the refcount taken in sfe_ipv6_create_rule() during the call
-	 * to __udp6_lib_lookup().
-	 */
-	up = c->reply_match->up;
-	if (up) {
-		sk = (struct sock *)up;
-		sock_put(sk);
-	}
-
-	/*
-	 * Release our hold of the source and dest devices and free the memory
-	 * for our connection objects.
-	 */
-	dev_put(c->original_dev);
-	dev_put(c->reply_dev);
-	kfree(c->original_match);
-	kfree(c->reply_match);
-	kfree(c);
-}
-
-/*
- * sfe_ipv6_sync_status()
- *	Report a connection's status to its connection manager.
- *
- * si: the IPv6 context
- * c: the connection to be reported
- * reason: the reason for the sync (flush or destroy)
- */
-void sfe_ipv6_sync_status(struct sfe_ipv6 *si,
-			  struct sfe_ipv6_connection *c,
-			  sfe_sync_reason_t reason)
-{
-	struct sfe_connection_sync sis;
-	u64 now_jiffies;
-	sfe_sync_rule_callback_t sync_rule_callback;
-
-	rcu_read_lock();
-	sync_rule_callback = rcu_dereference(si->sync_rule_callback);
-	rcu_read_unlock();
-	if (unlikely(!sync_rule_callback)) {
-		return;
-	}
-
-	/*
-	 * Generate a sync message and then sync.
-	 */
-	now_jiffies = get_jiffies_64();
-	sfe_ipv6_gen_sync_connection(si, c, &sis, reason, now_jiffies);
-	sync_rule_callback(&sis);
-}
-
-/*
- * sfe_ipv6_flush_connection()
- *	Flush a connection and free all associated resources.
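Taken together with sfe_ipv6_remove_connection(), the flush above completes the RCU teardown sequence: unlink under the lock, sync the final stats, then defer the free until all readers are done. The end-to-end shape, using this file's own functions:

static void example_destroy(struct sfe_ipv6 *si, struct sfe_ipv6_connection *c)
{
	bool removed;

	spin_lock_bh(&si->lock);
	removed = sfe_ipv6_remove_connection(si, c);	/* marks c->removed, unlinks */
	spin_unlock_bh(&si->lock);

	if (removed) {
		/* syncs status, then call_rcu() frees after a grace period */
		sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH);
	}
}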
- *
- * We need to be called with bottom halves disabled locally as we need to acquire
- * the connection hash lock and release it again. In general we're actually called
- * from within a BH and so we're fine, but we're also called when connections are
- * torn down.
- */
-void sfe_ipv6_flush_connection(struct sfe_ipv6 *si,
-			       struct sfe_ipv6_connection *c,
-			       sfe_sync_reason_t reason)
-{
-	BUG_ON(!c->removed);
-
-	this_cpu_inc(si->stats_pcpu->connection_flushes64);
-	sfe_ipv6_sync_status(si, c, reason);
-
-	/*
-	 * Release our hold of the source and dest devices and free the memory
-	 * for our connection objects.
-	 */
-	call_rcu(&c->rcu, sfe_ipv6_free_sfe_ipv6_connection_rcu);
-}
-
-/*
- * sfe_ipv6_service_class_stats_pcpu_get()
- *	Gets one CPU's service class statistics.
- */
-static inline bool sfe_ipv6_service_class_stats_pcpu_get(struct sfe_ipv6_per_service_class_stats *sc_stats, uint64_t *bytes, uint64_t *packets)
-{
-	uint32_t retries = 0;
-	uint32_t seq;
-	uint64_t bytes_tmp, packets_tmp;
-
-	do {
-		seq = read_seqcount_begin(&sc_stats->seq);
-		bytes_tmp = sc_stats->tx_bytes;
-		packets_tmp = sc_stats->tx_packets;
-	} while (read_seqcount_retry(&sc_stats->seq, seq) && ++retries < SFE_SERVICE_CLASS_STATS_MAX_RETRY);
-
-	*bytes += bytes_tmp;
-	*packets += packets_tmp;
-
-	return retries < SFE_SERVICE_CLASS_STATS_MAX_RETRY;
-}
-
-/*
- * sfe_ipv6_service_class_stats_get()
- *	Copy the IPv6 statistics for the given service class.
- */
-bool sfe_ipv6_service_class_stats_get(uint8_t sid, uint64_t *bytes, uint64_t *packets)
-{
-	struct sfe_ipv6 *si = &__si6;
-	uint32_t cpu = 0;
-
-	for_each_possible_cpu(cpu) {
-		struct sfe_ipv6_service_class_stats_db *stats_db = per_cpu_ptr(si->stats_pcpu_psc, cpu);
-		struct sfe_ipv6_per_service_class_stats *sc_stats = &stats_db->psc_stats[sid];
-
-		if (!sfe_ipv6_service_class_stats_pcpu_get(sc_stats, bytes, packets)) {
-			return false;
-		}
-	}
-
-	return true;
-}
-
-/*
- * sfe_ipv6_service_class_stats_inc()
- *	Increment the per-CPU, per-service-class stats.
- */
-void sfe_ipv6_service_class_stats_inc(struct sfe_ipv6 *si, uint8_t sid, uint64_t bytes)
-{
-	struct sfe_ipv6_service_class_stats_db *sc_stats_db = this_cpu_ptr(si->stats_pcpu_psc);
-	struct sfe_ipv6_per_service_class_stats *sc_stats = &sc_stats_db->psc_stats[sid];
-
-	write_seqcount_begin(&sc_stats->seq);
-	sc_stats->tx_bytes += bytes;
-	sc_stats->tx_packets++;
-	write_seqcount_end(&sc_stats->seq);
-}
-
-/*
- * sfe_ipv6_exception_stats_inc()
- *	Increment exception stats.
- */
-void sfe_ipv6_exception_stats_inc(struct sfe_ipv6 *si, enum sfe_ipv6_exception_events reason)
-{
-	struct sfe_ipv6_stats *stats = this_cpu_ptr(si->stats_pcpu);
-
-	stats->exception_events64[reason]++;
-	stats->packets_not_forwarded64++;
-}
-
-/*
- * sfe_ipv6_is_local_ip()
- *	Return true if the address is a local IP, otherwise false.
- */
-static bool sfe_ipv6_is_local_ip(struct sfe_ipv6 *si, uint8_t *addr)
-{
-	struct net_device *dev;
-	struct in6_addr ip_addr;
-	memcpy(ip_addr.s6_addr, addr, 16);
-
-	dev = ipv6_dev_find(&init_net, &ip_addr, 1);
-	if (dev) {
-		dev_put(dev);
-		return true;
-	}
-
-	return false;
-}
-
-/*
- * sfe_ipv6_recv()
- *	Handle packet receives and forwarding.
- *
- * Returns 1 if the packet is forwarded or 0 if it isn't.
- */
-int sfe_ipv6_recv(struct net_device *dev, struct sk_buff *skb, struct sfe_l2_info *l2_info, bool tun_outer)
-{
-	struct sfe_ipv6 *si = &__si6;
-	unsigned int len;
-	unsigned int payload_len;
-	unsigned int ihl = sizeof(struct ipv6hdr);
-	bool sync_on_find = false;
-	struct ipv6hdr *iph;
-	u8 next_hdr;
-
-	/*
-	 * Check that we have space for an IP header and an upper-layer header here.
-	 */
-	len = skb->len;
-	if (!pskb_may_pull(skb, ihl + sizeof(struct sfe_ipv6_ext_hdr))) {
-
-		sfe_ipv6_exception_stats_inc(si, SFE_IPV6_EXCEPTION_EVENT_HEADER_INCOMPLETE);
-		DEBUG_TRACE("len: %u is too short\n", len);
-		return 0;
-	}
-
-	/*
-	 * Is our IP version wrong?
-	 */
-	iph = (struct ipv6hdr *)skb->data;
-	if (unlikely(iph->version != 6)) {
-
-		sfe_ipv6_exception_stats_inc(si, SFE_IPV6_EXCEPTION_EVENT_NON_V6);
-		DEBUG_TRACE("IP version: %u\n", iph->version);
-		return 0;
-	}
-
-	/*
-	 * Does our datagram fit inside the skb?
-	 */
-	payload_len = ntohs(iph->payload_len);
-	if (unlikely(payload_len > (len - ihl))) {
-
-		sfe_ipv6_exception_stats_inc(si, SFE_IPV6_EXCEPTION_EVENT_DATAGRAM_INCOMPLETE);
-		DEBUG_TRACE("payload_len: %u, exceeds len: %u\n", payload_len, (len - (unsigned int)sizeof(struct ipv6hdr)));
-		return 0;
-	}
-
-	next_hdr = iph->nexthdr;
-	while (unlikely(sfe_ipv6_is_ext_hdr(next_hdr))) {
-		struct sfe_ipv6_ext_hdr *ext_hdr;
-		unsigned int ext_hdr_len;
-
-		ext_hdr = (struct sfe_ipv6_ext_hdr *)(skb->data + ihl);
-
-		ext_hdr_len = ext_hdr->hdr_len;
-		ext_hdr_len <<= 3;
-		ext_hdr_len += sizeof(struct sfe_ipv6_ext_hdr);
-		ihl += ext_hdr_len;
-		if (!pskb_may_pull(skb, ihl + sizeof(struct sfe_ipv6_ext_hdr))) {
-			sfe_ipv6_exception_stats_inc(si, SFE_IPV6_EXCEPTION_EVENT_HEADER_INCOMPLETE);
-
-			DEBUG_TRACE("extension header %d is incomplete\n", next_hdr);
-			return 0;
-		}
-		/*
-		 * Packets with extension headers are not handled in the fast
-		 * path; sync their status and exception them to the kernel.
-		 */
-		sync_on_find = true;
-		next_hdr = ext_hdr->next_hdr;
-	}
-
-	/*
-	 * Handle PPPoE bridge packets using 3-tuple acceleration if SFE_PPPOE_BR_ACCEL_MODE_EN_3T
-	 */
-	if (unlikely(sfe_l2_parse_flag_check(l2_info, SFE_L2_PARSE_FLAGS_PPPOE_INGRESS)) &&
-	    unlikely(sfe_pppoe_get_br_accel_mode() == SFE_PPPOE_BR_ACCEL_MODE_EN_3T)) {
-		struct ethhdr *eth = eth_hdr(skb);
-		if (!sfe_pppoe_mgr_find_session(l2_info->pppoe_session_id, eth->h_source)) {
-			return sfe_ipv6_recv_pppoe_bridge(si, skb, dev, len, iph, ihl, l2_info);
-		}
-	}
-
-	if (IPPROTO_UDP == next_hdr) {
-		return sfe_ipv6_recv_udp(si, skb, dev, len, iph, ihl, sync_on_find, l2_info, tun_outer);
-	}
-
-	if (IPPROTO_TCP == next_hdr) {
-		return sfe_ipv6_recv_tcp(si, skb, dev, len, iph, ihl, sync_on_find, l2_info);
-	}
-
-	if (IPPROTO_ESP == next_hdr) {
-		return sfe_ipv6_recv_esp(si, skb, dev, len, iph, ihl, sync_on_find, tun_outer);
-	}
-
-	if (IPPROTO_ICMPV6 == next_hdr) {
-		return sfe_ipv6_recv_icmp(si, skb, dev, len, iph, ihl);
-	}
-
-	if (IPPROTO_IPIP == next_hdr) {
-		return sfe_ipv6_recv_tunipip6(si, skb, dev, len, iph, ihl, sync_on_find, l2_info, true);
-	}
-
-#ifdef SFE_GRE_TUN_ENABLE
-	if (IPPROTO_GRE == next_hdr) {
-		return sfe_ipv6_recv_gre(si, skb, dev, len, iph, ihl, sync_on_find, l2_info, tun_outer);
-	}
-#endif
-
-	sfe_ipv6_exception_stats_inc(si, SFE_IPV6_EXCEPTION_EVENT_UNHANDLED_PROTOCOL);
-	DEBUG_TRACE("unhandled protocol: %u\n", next_hdr);
-	return 0;
-}
-
-/*
- * sfe_ipv6_update_tcp_state()
- *	Update TCP window variables.
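The extension-header walk above uses the RFC 8200 length rule: hdr_len counts 8-octet units beyond the mandatory first 8 octets, so the total size is (hdr_len << 3) + 8, assuming sizeof(struct sfe_ipv6_ext_hdr) is that canonical 8 bytes. Worked example as a helper:

#include <stdint.h>

/* A Destination Options header with hdr_len == 1 occupies (1 << 3) + 8 = 16 bytes. */
static inline unsigned int ipv6_ext_hdr_bytes(uint8_t hdr_len)
{
	return ((unsigned int)hdr_len << 3) + 8;
}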
- */
-static void
-sfe_ipv6_update_tcp_state(struct sfe_ipv6_connection *c,
-			  struct sfe_ipv6_rule_create_msg *msg)
-{
-	struct sfe_ipv6_connection_match *orig_cm;
-	struct sfe_ipv6_connection_match *repl_cm;
-	struct sfe_ipv6_tcp_connection_match *orig_tcp;
-	struct sfe_ipv6_tcp_connection_match *repl_tcp;
-
-	orig_cm = c->original_match;
-	repl_cm = c->reply_match;
-	orig_tcp = &orig_cm->protocol_state.tcp;
-	repl_tcp = &repl_cm->protocol_state.tcp;
-
-	/* update the original direction */
-	if (orig_tcp->max_win < msg->tcp_rule.flow_max_window) {
-		orig_tcp->max_win = msg->tcp_rule.flow_max_window;
-	}
-	if ((s32)(orig_tcp->end - msg->tcp_rule.flow_end) < 0) {
-		orig_tcp->end = msg->tcp_rule.flow_end;
-	}
-	if ((s32)(orig_tcp->max_end - msg->tcp_rule.flow_max_end) < 0) {
-		orig_tcp->max_end = msg->tcp_rule.flow_max_end;
-	}
-
-	/* update the reply direction */
-	if (repl_tcp->max_win < msg->tcp_rule.return_max_window) {
-		repl_tcp->max_win = msg->tcp_rule.return_max_window;
-	}
-	if ((s32)(repl_tcp->end - msg->tcp_rule.return_end) < 0) {
-		repl_tcp->end = msg->tcp_rule.return_end;
-	}
-	if ((s32)(repl_tcp->max_end - msg->tcp_rule.return_max_end) < 0) {
-		repl_tcp->max_end = msg->tcp_rule.return_max_end;
-	}
-
-	/* update the match flags */
-	orig_cm->flags &= ~SFE_IPV6_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK;
-	repl_cm->flags &= ~SFE_IPV6_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK;
-	if (msg->rule_flags & SFE_RULE_CREATE_FLAG_NO_SEQ_CHECK) {
-		orig_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK;
-		repl_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK;
-	}
-}
-
-/*
- * sfe_ipv6_update_protocol_state()
- *	Update the protocol-specific state machine.
- */
-static void
-sfe_ipv6_update_protocol_state(struct sfe_ipv6_connection *c,
-			       struct sfe_ipv6_rule_create_msg *msg)
-{
-	switch (msg->tuple.protocol) {
-	case IPPROTO_TCP:
-		sfe_ipv6_update_tcp_state(c, msg);
-		break;
-	}
-}
-
-/*
- * sfe_ipv6_match_entry_set_vlan()
- *	Set the ingress and egress VLAN tag information in a connection match entry.
- */
-static void sfe_ipv6_match_entry_set_vlan(
-	struct sfe_ipv6_connection_match *cm,
-	u32 primary_ingress_vlan_tag,
-	u32 primary_egress_vlan_tag,
-	u32 secondary_ingress_vlan_tag,
-	u32 secondary_egress_vlan_tag)
-{
-	u16 tpid;
-	/*
-	 * Prevent stacking header counts when updating.
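The (s32)(a - b) < 0 casts above implement serial-number arithmetic so TCP window edges compare correctly across 32-bit wrap. As a helper with a worked example:

#include <stdint.h>

/* True when a precedes b in 32-bit sequence space. For a = 0xfffffff0 and
 * b = 0x00000010: (int32_t)(a - b) == -0x20 < 0, so a is "before" b even
 * though it is numerically larger. */
static inline int seq_before(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) < 0;
}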
-	 */
-	cm->ingress_vlan_hdr_cnt = 0;
-	cm->egress_vlan_hdr_cnt = 0;
-	memset(cm->ingress_vlan_hdr, 0, sizeof(cm->ingress_vlan_hdr));
-	memset(cm->egress_vlan_hdr, 0, sizeof(cm->egress_vlan_hdr));
-
-	/*
-	 * vlan_hdr[0] corresponds to the outer tag and vlan_hdr[1] to the inner tag.
-	 * Extract the VLAN information (tpid and tci) from the rule message.
-	 */
-	if ((primary_ingress_vlan_tag & VLAN_VID_MASK) != SFE_VLAN_ID_NOT_CONFIGURED) {
-		tpid = (u16)(primary_ingress_vlan_tag >> 16);
-		cm->ingress_vlan_hdr[0].tpid = ntohs(tpid);
-		cm->ingress_vlan_hdr[0].tci = (u16)primary_ingress_vlan_tag;
-		cm->ingress_vlan_hdr_cnt++;
-	}
-
-	if ((secondary_ingress_vlan_tag & VLAN_VID_MASK) != SFE_VLAN_ID_NOT_CONFIGURED) {
-		tpid = (u16)(secondary_ingress_vlan_tag >> 16);
-		cm->ingress_vlan_hdr[1].tpid = ntohs(tpid);
-		cm->ingress_vlan_hdr[1].tci = (u16)secondary_ingress_vlan_tag;
-		cm->ingress_vlan_hdr_cnt++;
-	}
-
-	if ((primary_egress_vlan_tag & VLAN_VID_MASK) != SFE_VLAN_ID_NOT_CONFIGURED) {
-		tpid = (u16)(primary_egress_vlan_tag >> 16);
-		cm->egress_vlan_hdr[0].tpid = ntohs(tpid);
-		cm->egress_vlan_hdr[0].tci = (u16)primary_egress_vlan_tag;
-		cm->egress_vlan_hdr_cnt++;
-	}
-
-	if ((secondary_egress_vlan_tag & VLAN_VID_MASK) != SFE_VLAN_ID_NOT_CONFIGURED) {
-		tpid = (u16)(secondary_egress_vlan_tag >> 16);
-		cm->egress_vlan_hdr[1].tpid = ntohs(tpid);
-		cm->egress_vlan_hdr[1].tci = (u16)secondary_egress_vlan_tag;
-		cm->egress_vlan_hdr_cnt++;
-	}
-}
-
-/*
- * sfe_ipv6_update_rule()
- *	Update a forwarding rule after it has been created.
- */
-void sfe_ipv6_update_rule(struct sfe_ipv6_rule_create_msg *msg)
-{
-	struct sfe_ipv6_connection *c;
-	struct sfe_ipv6 *si = &__si6;
-
-	spin_lock_bh(&si->lock);
-
-	c = sfe_ipv6_find_connection(si,
-				     msg->tuple.protocol,
-				     (struct sfe_ipv6_addr *)msg->tuple.flow_ip,
-				     msg->tuple.flow_ident,
-				     (struct sfe_ipv6_addr *)msg->tuple.return_ip,
-				     msg->tuple.return_ident);
-	if (c != NULL) {
-		sfe_ipv6_update_protocol_state(c, msg);
-	}
-
-	spin_unlock_bh(&si->lock);
-}
-
-/*
- * sfe_ipv6_mark_rule_update()
- *	Updates the mark values of match entries.
- */
-void sfe_ipv6_mark_rule_update(struct sfe_connection_mark *mark)
-{
-	struct sfe_ipv6_connection *c;
-	struct sfe_ipv6 *si = &__si6;
-
-	spin_lock_bh(&si->lock);
-	c = sfe_ipv6_find_connection(si, mark->protocol,
-				     (struct sfe_ipv6_addr *)mark->src_ip,
-				     mark->src_port,
-				     (struct sfe_ipv6_addr *)mark->dest_ip,
-				     mark->dest_port);
-	if (!c) {
-		spin_unlock_bh(&si->lock);
-		DEBUG_WARN("%px: connection not found for mark update\n", mark);
-		return;
-	}
-	c->original_match->mark = mark->mark;
-	c->reply_match->mark = mark->mark;
-	spin_unlock_bh(&si->lock);
-	DEBUG_TRACE("%px: connection mark updated with %d\n", mark, mark->mark);
-}
-EXPORT_SYMBOL(sfe_ipv6_mark_rule_update);
-
-/*
- * sfe_ipv6_xmit_eth_type_check()
- *	Check whether a MAC header has to be written.
- */
-static inline bool sfe_ipv6_xmit_eth_type_check(struct net_device *dev, u32 cm_flags)
-{
-	if (!(dev->flags & IFF_NOARP)) {
-		return true;
-	}
-
-	/*
-	 * For PPPoE, since we now support PPPoE encapsulation, we write the L2 header.
-	 */
-	if (cm_flags & SFE_IPV6_CONNECTION_MATCH_FLAG_PPPOE_ENCAP) {
-		return true;
-	}
-
-	return false;
-}
-
-/*
- * sfe_ipv6_create_rule()
- *	Create a forwarding rule.
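The rule message packs each VLAN tag into a single u32, TPID in the upper 16 bits and TCI (PCP/DEI/VID) in the lower 16, which is what the shifts and casts above unpack. A worked example:

#include <stdint.h>

/* For tag 0x81000064: TPID 0x8100 (802.1Q), TCI 0x0064 (VID 100). */
static void vlan_tag_unpack(uint32_t tag, uint16_t *tpid, uint16_t *tci)
{
	*tpid = (uint16_t)(tag >> 16);
	*tci = (uint16_t)tag;
}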
- */ -int sfe_ipv6_create_rule(struct sfe_ipv6_rule_create_msg *msg) -{ - struct sfe_ipv6 *si = &__si6; - struct sfe_ipv6_connection *c, *old_c; - struct sfe_ipv6_connection_match *original_cm; - struct sfe_ipv6_connection_match *reply_cm; - struct net_device *dest_dev; - struct net_device *src_dev; - struct sfe_ipv6_5tuple *tuple = &msg->tuple; - struct sock *sk; - struct net *net; - unsigned int src_if_idx; - - s32 flow_interface_num = msg->conn_rule.flow_top_interface_num; - s32 return_interface_num = msg->conn_rule.return_top_interface_num; - u32 flow_sawf_tag; - u32 return_sawf_tag; - - if (msg->rule_flags & SFE_RULE_CREATE_FLAG_USE_FLOW_BOTTOM_INTERFACE) { - flow_interface_num = msg->conn_rule.flow_interface_num; - } - - if (msg->rule_flags & SFE_RULE_CREATE_FLAG_USE_RETURN_BOTTOM_INTERFACE) { - return_interface_num = msg->conn_rule.return_interface_num; - } - - src_dev = dev_get_by_index(&init_net, flow_interface_num); - if (!src_dev) { - DEBUG_WARN("%px: Unable to find src_dev corresponding to %d\n", msg, - flow_interface_num); - this_cpu_inc(si->stats_pcpu->connection_create_failures64); - return -EINVAL; - } - - dest_dev = dev_get_by_index(&init_net, return_interface_num); - if (!dest_dev) { - DEBUG_WARN("%px: Unable to find dest_dev corresponding to %d\n", msg, - return_interface_num); - this_cpu_inc(si->stats_pcpu->connection_create_failures64); - dev_put(src_dev); - return -EINVAL; - } - - if (unlikely((dest_dev->reg_state != NETREG_REGISTERED) || - (src_dev->reg_state != NETREG_REGISTERED))) { - DEBUG_WARN("%px: src_dev=%s and dest_dev=%s are unregistered\n", msg, - src_dev->name, dest_dev->name); - this_cpu_inc(si->stats_pcpu->connection_create_failures64); - dev_put(src_dev); - dev_put(dest_dev); - return -EINVAL; - } - - /* - * Allocate the various connection tracking objects. - */ - c = (struct sfe_ipv6_connection *)kzalloc(sizeof(struct sfe_ipv6_connection), GFP_ATOMIC); - if (unlikely(!c)) { - DEBUG_WARN("%px: memory allocation of connection entry failed\n", msg); - this_cpu_inc(si->stats_pcpu->connection_create_failures64); - dev_put(src_dev); - dev_put(dest_dev); - return -ENOMEM; - } - - original_cm = (struct sfe_ipv6_connection_match *)kzalloc(sizeof(struct sfe_ipv6_connection_match), GFP_ATOMIC); - if (unlikely(!original_cm)) { - this_cpu_inc(si->stats_pcpu->connection_create_failures64); - DEBUG_WARN("%px: memory allocation of connection match entry failed\n", msg); - kfree(c); - dev_put(src_dev); - dev_put(dest_dev); - return -ENOMEM; - } - - reply_cm = (struct sfe_ipv6_connection_match *)kzalloc(sizeof(struct sfe_ipv6_connection_match), GFP_ATOMIC); - if (unlikely(!reply_cm)) { - this_cpu_inc(si->stats_pcpu->connection_create_failures64); - DEBUG_WARN("%px: memory allocation of connection match entry failed\n", msg); - kfree(original_cm); - kfree(c); - dev_put(src_dev); - dev_put(dest_dev); - return -ENOMEM; - } - - this_cpu_inc(si->stats_pcpu->connection_create_requests64); - - spin_lock_bh(&si->lock); - - /* - * Check to see if there is already a flow that matches the rule we're - * trying to create. If there is then we can't create a new one. 
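 *
 * Conceptually the lookup key is just the 5-tuple from the message
 * (sketch):
 *
 *	{ tuple->protocol,
 *	  tuple->flow_ip,   tuple->flow_ident,
 *	  tuple->return_ip, tuple->return_ident }
 *
 * A hit means the flow is already accelerated, so the request below only
 * refreshes the protocol state and returns -EADDRINUSE.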
- */ - old_c = sfe_ipv6_find_connection(si, - tuple->protocol, - (struct sfe_ipv6_addr *)tuple->flow_ip, - tuple->flow_ident, - (struct sfe_ipv6_addr *)tuple->return_ip, - tuple->return_ident); - - if (old_c != NULL) { - this_cpu_inc(si->stats_pcpu->connection_create_collisions64); - - /* - * If we already have the flow then it's likely that this - * request to create the connection rule contains more - * up-to-date information. Check and update accordingly. - */ - sfe_ipv6_update_protocol_state(old_c, msg); - spin_unlock_bh(&si->lock); - - kfree(reply_cm); - kfree(original_cm); - kfree(c); - dev_put(src_dev); - dev_put(dest_dev); - - DEBUG_TRACE("connection already exists - p: %d\n" - " s: %s:%pxM:%pI6:%u, d: %s:%pxM:%pI6:%u\n", - tuple->protocol, - src_dev->name, msg->conn_rule.flow_mac, tuple->flow_ip, ntohs(tuple->flow_ident), - dest_dev->name, msg->conn_rule.return_mac, tuple->return_ip, ntohs(tuple->return_ident)); - return -EADDRINUSE; - } - - /* - * Fill in the "original" direction connection matching object. - * Note that the transmit MAC address is "dest_mac_xlate" because - * we always know both ends of a connection by their translated - * addresses and not their public addresses. - */ - original_cm->match_dev = src_dev; - original_cm->match_protocol = tuple->protocol; - original_cm->match_src_ip[0] = *(struct sfe_ipv6_addr *)tuple->flow_ip; - original_cm->match_src_port = netif_is_vxlan(src_dev) ? 0 : tuple->flow_ident; - original_cm->match_dest_ip[0] = *(struct sfe_ipv6_addr *)tuple->return_ip; - original_cm->match_dest_port = tuple->return_ident; - - original_cm->xlate_src_ip[0] = *(struct sfe_ipv6_addr *)tuple->flow_ip; - original_cm->xlate_src_port = tuple->flow_ident; - original_cm->xlate_dest_ip[0] = *(struct sfe_ipv6_addr *)tuple->return_ip; - original_cm->xlate_dest_port = tuple->return_ident; - - original_cm->xmit_dev = dest_dev; - - original_cm->xmit_dev_mtu = msg->conn_rule.return_mtu; - - original_cm->connection = c; - original_cm->counter_match = reply_cm; - - /* - * Valid in decap direction only - */ - RCU_INIT_POINTER(original_cm->up, NULL); - - if (msg->valid_flags & SFE_RULE_CREATE_MARK_VALID) { - original_cm->mark = msg->mark_rule.flow_mark; - original_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_MARK; - } - if (msg->valid_flags & SFE_RULE_CREATE_QOS_VALID) { - original_cm->priority = msg->qos_rule.flow_qos_tag; - original_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_PRIORITY_REMARK; - } - - if (msg->valid_flags & SFE_RULE_CREATE_DSCP_MARKING_VALID) { - original_cm->dscp = msg->dscp_rule.flow_dscp << SFE_IPV6_DSCP_SHIFT; - original_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_DSCP_REMARK; - } - if (msg->rule_flags & SFE_RULE_CREATE_FLAG_BRIDGE_FLOW) { - original_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_BRIDGE_FLOW; - } - if (msg->rule_flags & SFE_RULE_CREATE_FLAG_FLOW_TRANSMIT_FAST) { - original_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_FAST_XMIT_DEV_ADMISSION; - } - - /* - * Mark SAWF metadata if the sawf tag is valid. 
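 *
 * The connection mark doubles as the carrier for SAWF metadata; later
 * consumers such as the debug output recover the fields with companion
 * macros, roughly (the macro internals are assumed, not shown in this
 * file):
 *
 *	if (cm->sawf_valid) {
 *		u32 svc   = SFE_GET_SAWF_SERVICE_CLASS(cm->mark);
 *		u32 msduq = SFE_GET_SAWF_MSDUQ(cm->mark);
 *	}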
- */ - original_cm->sawf_valid = false; - flow_sawf_tag = SFE_GET_SAWF_TAG(msg->sawf_rule.flow_mark); - if (likely(SFE_SAWF_TAG_IS_VALID(flow_sawf_tag))) { - original_cm->mark = msg->sawf_rule.flow_mark; - original_cm->sawf_valid = true; - original_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_MARK; - } - - /* - * Add VLAN rule to original_cm - */ - if (msg->valid_flags & SFE_RULE_CREATE_VLAN_VALID) { - struct sfe_vlan_rule *vlan_primary_rule = &msg->vlan_primary_rule; - struct sfe_vlan_rule *vlan_secondary_rule = &msg->vlan_secondary_rule; - sfe_ipv6_match_entry_set_vlan(original_cm, - vlan_primary_rule->ingress_vlan_tag, - vlan_primary_rule->egress_vlan_tag, - vlan_secondary_rule->ingress_vlan_tag, - vlan_secondary_rule->egress_vlan_tag); - - if ((msg->rule_flags & SFE_RULE_CREATE_FLAG_USE_RETURN_BOTTOM_INTERFACE) && - original_cm->egress_vlan_hdr_cnt > 0) { - original_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_INSERT_EGRESS_VLAN_TAG; - original_cm->l2_hdr_size += original_cm->egress_vlan_hdr_cnt * VLAN_HLEN; - } - } - - if ((IPPROTO_GRE == tuple->protocol) && !sfe_ipv6_is_local_ip(si, (uint8_t *)original_cm->match_dest_ip)) { - original_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_PASSTHROUGH; - } - -#ifdef CONFIG_NF_FLOW_COOKIE - original_cm->flow_cookie = 0; -#endif -#ifdef CONFIG_XFRM - if (msg->valid_flags & SFE_RULE_CREATE_DIRECTION_VALID) { - original_cm->flow_accel = msg->direction_rule.flow_accel; - } else { - original_cm->flow_accel = 1; - } -#endif - /* - * If l2_features are disabled and flow uses l2 features such as macvlan/bridge/pppoe/vlan, - * bottom interfaces are expected to be disabled in the flow rule and always top interfaces - * are used. In such cases, do not use HW csum offload. csum offload is used only when we - * are sending directly to the destination interface that supports it. - */ - if (likely(dest_dev->features & NETIF_F_HW_CSUM) && sfe_dev_has_hw_csum(dest_dev)) { - if ((msg->conn_rule.return_top_interface_num == msg->conn_rule.return_interface_num) || - (msg->rule_flags & SFE_RULE_CREATE_FLAG_USE_RETURN_BOTTOM_INTERFACE)) { - /* - * Dont enable CSUM offload - */ -#if 0 - original_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_CSUM_OFFLOAD; -#endif - } - } - - /* - * Adding PPPoE parameters to original and reply entries based on the direction where - * PPPoE header is valid in ECM rule. 
- * - * If PPPoE is valid in flow direction (from interface is PPPoE), then - * original cm will have PPPoE at ingress (strip PPPoE header) - * reply cm will have PPPoE at egress (add PPPoE header) - * - * If PPPoE is valid in return direction (to interface is PPPoE), then - * original cm will have PPPoE at egress (add PPPoE header) - * reply cm will have PPPoE at ingress (strip PPPoE header) - */ - if (msg->valid_flags & SFE_RULE_CREATE_PPPOE_DECAP_VALID) { - original_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_PPPOE_DECAP; - original_cm->pppoe_session_id = msg->pppoe_rule.flow_pppoe_session_id; - ether_addr_copy(original_cm->pppoe_remote_mac, msg->pppoe_rule.flow_pppoe_remote_mac); - - reply_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_PPPOE_ENCAP; - reply_cm->l2_hdr_size += PPPOE_SES_HLEN; - reply_cm->pppoe_session_id = msg->pppoe_rule.flow_pppoe_session_id; - ether_addr_copy(reply_cm->pppoe_remote_mac, msg->pppoe_rule.flow_pppoe_remote_mac); - } - - if (msg->valid_flags & SFE_RULE_CREATE_PPPOE_ENCAP_VALID) { - original_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_PPPOE_ENCAP; - original_cm->l2_hdr_size += PPPOE_SES_HLEN; - original_cm->pppoe_session_id = msg->pppoe_rule.return_pppoe_session_id; - ether_addr_copy(original_cm->pppoe_remote_mac, msg->pppoe_rule.return_pppoe_remote_mac); - - reply_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_PPPOE_DECAP; - reply_cm->pppoe_session_id = msg->pppoe_rule.return_pppoe_session_id; - ether_addr_copy(reply_cm->pppoe_remote_mac, msg->pppoe_rule.return_pppoe_remote_mac); - } - - if (msg->rule_flags & SFE_RULE_CREATE_FLAG_FLOW_SRC_INTERFACE_CHECK) { - original_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_SRC_INTERFACE_CHECK; - } - - if (msg->rule_flags & SFE_RULE_CREATE_FLAG_FLOW_SRC_INTERFACE_CHECK_NO_FLUSH) { - original_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_SRC_INTERFACE_CHECK_NO_FLUSH; - } - - /* - * For the non-arp interface, we don't write L2 HDR. - * Excluding PPPoE from this, since we are now supporting PPPoE encap/decap. - */ - if (sfe_ipv6_xmit_eth_type_check(dest_dev, original_cm->flags)) { - - /* - * Check whether the rule has configured a specific source MAC address to use. - * This is needed when virtual L3 interfaces such as br-lan, macvlan, vlan are used during egress - */ - if (msg->rule_flags & SFE_RULE_CREATE_FLAG_BRIDGE_FLOW) { - ether_addr_copy((u8 *)original_cm->xmit_src_mac, (u8 *)msg->conn_rule.flow_mac); - } else { - if ((msg->valid_flags & SFE_RULE_CREATE_SRC_MAC_VALID) && - (msg->src_mac_rule.mac_valid_flags & SFE_SRC_MAC_RETURN_VALID)) { - ether_addr_copy((u8 *)original_cm->xmit_src_mac, (u8 *)msg->src_mac_rule.return_src_mac); - } else { - ether_addr_copy((u8 *)original_cm->xmit_src_mac, (u8 *)dest_dev->dev_addr); - } - } - ether_addr_copy((u8 *)original_cm->xmit_dest_mac, (u8 *)msg->conn_rule.return_mac); - - original_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_WRITE_L2_HDR; - original_cm->l2_hdr_size += ETH_HLEN; - - /* - * If our dev writes Ethernet headers then we can write a really fast - * version - */ - if (dest_dev->header_ops) { - if (dest_dev->header_ops->create == eth_header) { - original_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_WRITE_FAST_ETH_HDR; - } - } - } - - /* - * Fill in the "reply" direction connection matching object. 
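 *
 * The reply entry is the mirror image of the original one; for a flow
 * A:pa -> B:pb the two match entries are keyed as (sketch):
 *
 *	original_cm: match src = A:pa, match dest = B:pb, xmit via dest_dev
 *	reply_cm:    match src = B:pb, match dest = A:pa, xmit via src_dev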
- */ - reply_cm->match_dev = dest_dev; - reply_cm->match_protocol = tuple->protocol; - reply_cm->match_src_ip[0] = *(struct sfe_ipv6_addr *)tuple->return_ip; - reply_cm->match_dest_ip[0] = *(struct sfe_ipv6_addr *)tuple->flow_ip; - reply_cm->match_dest_port = tuple->flow_ident; - reply_cm->xlate_src_ip[0] = *(struct sfe_ipv6_addr *)tuple->return_ip; - reply_cm->xlate_src_port = tuple->return_ident; - reply_cm->xlate_dest_ip[0] = *(struct sfe_ipv6_addr *)tuple->flow_ip; - reply_cm->xlate_dest_port = tuple->flow_ident; - - /* - * Keep source port as 0 for VxLAN tunnels. - */ - if (netif_is_vxlan(src_dev) || netif_is_vxlan(dest_dev)) { - reply_cm->match_src_port = 0; - } else { - reply_cm->match_src_port = tuple->return_ident; - } - - reply_cm->xmit_dev = src_dev; - reply_cm->xmit_dev_mtu = msg->conn_rule.flow_mtu; - - reply_cm->connection = c; - reply_cm->counter_match = original_cm; - - if (msg->valid_flags & SFE_RULE_CREATE_MARK_VALID) { - reply_cm->mark = msg->mark_rule.return_mark; - reply_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_MARK; - } - if (msg->valid_flags & SFE_RULE_CREATE_QOS_VALID) { - reply_cm->priority = msg->qos_rule.return_qos_tag; - reply_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_PRIORITY_REMARK; - } - if (msg->valid_flags & SFE_RULE_CREATE_DSCP_MARKING_VALID) { - reply_cm->dscp = msg->dscp_rule.return_dscp << SFE_IPV6_DSCP_SHIFT; - reply_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_DSCP_REMARK; - } - if (msg->rule_flags & SFE_RULE_CREATE_FLAG_BRIDGE_FLOW) { - reply_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_BRIDGE_FLOW; - } - if (msg->rule_flags & SFE_RULE_CREATE_FLAG_RETURN_TRANSMIT_FAST) { - reply_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_FAST_XMIT_DEV_ADMISSION; - } - - if ((IPPROTO_GRE == tuple->protocol) && !sfe_ipv6_is_local_ip(si, (uint8_t *)reply_cm->match_dest_ip)) { - reply_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_PASSTHROUGH; - } - - /* - * Mark return SAWF metadata if the sawf tag is valid. - */ - reply_cm->sawf_valid = false; - return_sawf_tag = SFE_GET_SAWF_TAG(msg->sawf_rule.return_mark); - if (likely(SFE_SAWF_TAG_IS_VALID(return_sawf_tag))) { - reply_cm->mark = msg->sawf_rule.return_mark; - reply_cm->sawf_valid = true; - reply_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_MARK; - } - - /* - * Setup UDP Socket if found to be valid for decap. - */ - RCU_INIT_POINTER(reply_cm->up, NULL); - net = dev_net(reply_cm->match_dev); - src_if_idx = src_dev->ifindex; - - rcu_read_lock(); - - /* - * Look for the associated sock object. - * __udp6_lib_lookup() holds a reference for this sock object, - * which will be released in sfe_ipv6_flush_connection() - */ -#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 13, 0)) - sk = __udp6_lib_lookup(net, (const struct in6_addr *)reply_cm->match_dest_ip, - reply_cm->match_dest_port, (const struct in6_addr *)reply_cm->xlate_src_ip, - reply_cm->xlate_src_port, src_if_idx, &udp_table); -#else - sk = __udp6_lib_lookup(net, (const struct in6_addr *)reply_cm->match_dest_ip, - reply_cm->match_dest_port, (const struct in6_addr *)reply_cm->xlate_src_ip, - reply_cm->xlate_src_port, src_if_idx, 0, &udp_table, NULL); -#endif - rcu_read_unlock(); - - /* - * We set the UDP sock pointer as valid only for decap direction. 
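 *
 * Only sockets that registered a tunnel decapsulation handler carry a
 * non-zero udp_sk(sk)->encap_type (a VxLAN or L2TP socket, for example),
 * so the test below reduces to (sketch):
 *
 *	if (sk && udp_sk(sk)->encap_type)
 *		rcu_assign_pointer(reply_cm->up, udp_sk(sk));	// decap only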
- */ - if (sk && udp_sk(sk)->encap_type) { -#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 13, 0)) - if (!atomic_add_unless(&sk->sk_refcnt, 1, 0)) { -#else - if (!refcount_inc_not_zero(&sk->sk_refcnt)) { -#endif - this_cpu_inc(si->stats_pcpu->connection_create_failures64); - spin_unlock_bh(&si->lock); - kfree(reply_cm); - kfree(original_cm); - kfree(c); - - DEBUG_INFO("sfe: unable to take reference for socket p:%d\n", tuple->protocol); - DEBUG_INFO("SK: connection - \n" - " s: %s:%pI6(%pI6):%u(%u)\n" - " d: %s:%pI6(%pI6):%u(%u)\n", - reply_cm->match_dev->name, &reply_cm->match_src_ip, &reply_cm->xlate_src_ip, - ntohs(reply_cm->match_src_port), ntohs(reply_cm->xlate_src_port), - reply_cm->xmit_dev->name, &reply_cm->match_dest_ip, &reply_cm->xlate_dest_ip, - ntohs(reply_cm->match_dest_port), ntohs(reply_cm->xlate_dest_port)); - - dev_put(src_dev); - dev_put(dest_dev); - - return -ESHUTDOWN; - } - - rcu_assign_pointer(reply_cm->up, udp_sk(sk)); - DEBUG_INFO("Sock lookup success with reply_cm direction(%p)\n", sk); - DEBUG_INFO("SK: connection - \n" - " s: %s:%pI6(%pI6):%u(%u)\n" - " d: %s:%pI6(%pI6):%u(%u)\n", - reply_cm->match_dev->name, &reply_cm->match_src_ip, &reply_cm->xlate_src_ip, - ntohs(reply_cm->match_src_port), ntohs(reply_cm->xlate_src_port), - reply_cm->xmit_dev->name, &reply_cm->match_dest_ip, &reply_cm->xlate_dest_ip, - ntohs(reply_cm->match_dest_port), ntohs(reply_cm->xlate_dest_port)); - } - - /* - * Add VLAN rule to reply_cm - */ - if (msg->valid_flags & SFE_RULE_CREATE_VLAN_VALID) { - struct sfe_vlan_rule *vlan_primary_rule = &msg->vlan_primary_rule; - struct sfe_vlan_rule *vlan_secondary_rule = &msg->vlan_secondary_rule; - sfe_ipv6_match_entry_set_vlan(reply_cm, - vlan_primary_rule->egress_vlan_tag, - vlan_primary_rule->ingress_vlan_tag, - vlan_secondary_rule->egress_vlan_tag, - vlan_secondary_rule->ingress_vlan_tag); - - if ((msg->rule_flags & SFE_RULE_CREATE_FLAG_USE_FLOW_BOTTOM_INTERFACE) && - reply_cm->egress_vlan_hdr_cnt > 0) { - reply_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_INSERT_EGRESS_VLAN_TAG; - reply_cm->l2_hdr_size += reply_cm->egress_vlan_hdr_cnt * VLAN_HLEN; - } - } - -#ifdef CONFIG_NF_FLOW_COOKIE - reply_cm->flow_cookie = 0; -#endif -#ifdef CONFIG_XFRM - if (msg->valid_flags & SFE_RULE_CREATE_DIRECTION_VALID) { - reply_cm->flow_accel = msg->direction_rule.return_accel; - } else { - reply_cm->flow_accel = 1; - } -#endif - - /* - * the inet6_protocol handler will be used only in decap path - * for non passthrough case. 
- */ - original_cm->proto = NULL; - reply_cm->proto = NULL; - original_cm->top_interface_dev = NULL; - reply_cm->top_interface_dev = NULL; - -#ifdef SFE_GRE_TUN_ENABLE - if ((IPPROTO_GRE == tuple->protocol) && !(reply_cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_PASSTHROUGH)) { - rcu_read_lock(); - reply_cm->proto = rcu_dereference(inet6_protos[IPPROTO_GRE]); - rcu_read_unlock(); - - if (unlikely(!reply_cm->proto)) { - this_cpu_inc(si->stats_pcpu->connection_create_failures64); - spin_unlock_bh(&si->lock); - kfree(reply_cm); - kfree(original_cm); - kfree(c); - dev_put(src_dev); - dev_put(dest_dev); - DEBUG_WARN("sfe: GRE proto handler is not registered\n"); - return -EPERM; - } - } -#endif - - if ((IPPROTO_ESP == tuple->protocol) && !(reply_cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_PASSTHROUGH)) { - rcu_read_lock(); - reply_cm->proto = rcu_dereference(inet6_protos[IPPROTO_ESP]); - rcu_read_unlock(); - - if (unlikely(!reply_cm->proto)) { - this_cpu_inc(si->stats_pcpu->connection_create_failures64); - spin_unlock_bh(&si->lock); - kfree(reply_cm); - kfree(original_cm); - kfree(c); - dev_put(src_dev); - dev_put(dest_dev); - DEBUG_WARN("sfe: ESP proto handler is not registered\n"); - return -EPERM; - } - } - - /* - * Decapsulation path have proto set. - * This is used to differentiate de/encap, and call protocol specific handler. - */ - if (IPPROTO_IPIP == tuple->protocol) { - original_cm->proto = NULL; - rcu_read_lock(); - reply_cm->proto = rcu_dereference(inet6_protos[tuple->protocol]); - rcu_read_unlock(); - reply_cm->top_interface_dev = dev_get_by_index(&init_net, msg->conn_rule.return_top_interface_num); - - if (unlikely(!reply_cm->top_interface_dev)) { - DEBUG_WARN("%px: Unable to find top_interface_dev corresponding to %d\n", msg, - msg->conn_rule.return_top_interface_num); - this_cpu_inc(si->stats_pcpu->connection_create_failures64); - spin_unlock_bh(&si->lock); - kfree(reply_cm); - kfree(original_cm); - kfree(c); - dev_put(src_dev); - dev_put(dest_dev); - return -EINVAL; - } - } - /* - * If l2_features are disabled and flow uses l2 features such as macvlan/bridge/pppoe/vlan, - * bottom interfaces are expected to be disabled in the flow rule and always top interfaces - * are used. In such cases, do not use HW csum offload. csum offload is used only when we - * are sending directly to the destination interface that supports it. - */ - if (likely(src_dev->features & NETIF_F_HW_CSUM) && sfe_dev_has_hw_csum(src_dev)) { - if ((msg->conn_rule.flow_top_interface_num == msg->conn_rule.flow_interface_num) || - (msg->rule_flags & SFE_RULE_CREATE_FLAG_USE_FLOW_BOTTOM_INTERFACE)) { - /* - * Dont enable CSUM offload - */ -#if 0 - reply_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_CSUM_OFFLOAD; -#endif - } - } - - if (msg->rule_flags & SFE_RULE_CREATE_FLAG_RETURN_SRC_INTERFACE_CHECK) { - reply_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_SRC_INTERFACE_CHECK; - } - - if (msg->rule_flags & SFE_RULE_CREATE_FLAG_RETURN_SRC_INTERFACE_CHECK_NO_FLUSH) { - reply_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_SRC_INTERFACE_CHECK_NO_FLUSH; - } - - /* - * For the non-arp interface, we don't write L2 HDR. - * Excluding PPPoE from this, since we are now supporting PPPoE encap/decap. - */ - if (sfe_ipv6_xmit_eth_type_check(src_dev, reply_cm->flags)) { - - /* - * Check whether the rule has configured a specific source MAC address to use. 
- * This is needed when virtual L3 interfaces such as br-lan, macvlan, vlan are used during egress - */ - if (msg->rule_flags & SFE_RULE_CREATE_FLAG_BRIDGE_FLOW) { - ether_addr_copy((u8 *)reply_cm->xmit_src_mac, (u8 *)msg->conn_rule.return_mac); - } else { - if ((msg->valid_flags & SFE_RULE_CREATE_SRC_MAC_VALID) && - (msg->src_mac_rule.mac_valid_flags & SFE_SRC_MAC_FLOW_VALID)) { - ether_addr_copy((u8 *)reply_cm->xmit_src_mac, (u8 *)msg->src_mac_rule.flow_src_mac); - } else { - ether_addr_copy((u8 *)reply_cm->xmit_src_mac, (u8 *)src_dev->dev_addr); - } - } - - ether_addr_copy((u8 *)reply_cm->xmit_dest_mac, (u8 *)msg->conn_rule.flow_mac); - - reply_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_WRITE_L2_HDR; - reply_cm->l2_hdr_size += ETH_HLEN; - - /* - * If our dev writes Ethernet headers then we can write a really fast - * version. - */ - if (src_dev->header_ops) { - if (src_dev->header_ops->create == eth_header) { - reply_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_WRITE_FAST_ETH_HDR; - } - } - } - - /* - * No support for NAT in ipv6 - */ - - /* - * Initialize the protocol-specific information that we track. - */ - switch (tuple->protocol) { - case IPPROTO_TCP: - original_cm->protocol_state.tcp.win_scale = msg->tcp_rule.flow_window_scale; - original_cm->protocol_state.tcp.max_win = msg->tcp_rule.flow_max_window ? msg->tcp_rule.flow_max_window : 1; - original_cm->protocol_state.tcp.end = msg->tcp_rule.flow_end; - original_cm->protocol_state.tcp.max_end = msg->tcp_rule.flow_max_end; - reply_cm->protocol_state.tcp.win_scale = msg->tcp_rule.return_window_scale; - reply_cm->protocol_state.tcp.max_win = msg->tcp_rule.return_max_window ? msg->tcp_rule.return_max_window : 1; - reply_cm->protocol_state.tcp.end = msg->tcp_rule.return_end; - reply_cm->protocol_state.tcp.max_end = msg->tcp_rule.return_max_end; - if (msg->rule_flags & SFE_RULE_CREATE_FLAG_NO_SEQ_CHECK) { - original_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK; - reply_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK; - } - break; - - case IPPROTO_RAW: - /* - * Set src_port to 0 to avoid hash collision in connection match lookups. - */ - original_cm->match_src_port = 0; - original_cm->xlate_src_port = 0; - reply_cm->match_src_port = 0; - reply_cm->xlate_src_port = 0; - break; - } - - /* - * Fill in the ipv6_connection object. - */ - c->protocol = tuple->protocol; - c->src_ip[0] = *(struct sfe_ipv6_addr *)tuple->flow_ip; - c->src_ip_xlate[0] = *(struct sfe_ipv6_addr *)tuple->flow_ip; - c->src_port = tuple->flow_ident; - c->src_port_xlate = tuple->flow_ident; - c->original_dev = src_dev; - c->original_match = original_cm; - - c->dest_ip[0] = *(struct sfe_ipv6_addr *)tuple->return_ip; - c->dest_ip_xlate[0] = *(struct sfe_ipv6_addr *)tuple->return_ip; - c->dest_port = tuple->return_ident; - c->dest_port_xlate = tuple->return_ident; - - c->reply_dev = dest_dev; - c->reply_match = reply_cm; - c->debug_read_seq = 0; - c->last_sync_jiffies = get_jiffies_64(); - c->removed = false; - - sfe_ipv6_connection_match_compute_translations(original_cm); - sfe_ipv6_connection_match_compute_translations(reply_cm); - sfe_ipv6_insert_connection(si, c); - - spin_unlock_bh(&si->lock); - - /* - * We have everything we need! 
- */
-	DEBUG_INFO("%px: new connection - p: %d\n"
-		   "  s: %s:%pxM(%pxM):%pI6(%pI6):%u(%u)\n"
-		   "  d: %s:%pxM(%pxM):%pI6(%pI6):%u(%u)\n",
-		   c, tuple->protocol,
-		   src_dev->name, msg->conn_rule.flow_mac, NULL,
-		   (void *)tuple->flow_ip, (void *)tuple->flow_ip, ntohs(tuple->flow_ident), ntohs(tuple->flow_ident),
-		   dest_dev->name, NULL, msg->conn_rule.return_mac,
-		   (void *)tuple->return_ip, (void *)tuple->return_ip, ntohs(tuple->return_ident), ntohs(tuple->return_ident));
-
-	return 0;
-}
-
-/*
- * sfe_ipv6_destroy_rule()
- *	Destroy a forwarding rule.
- */
-void sfe_ipv6_destroy_rule(struct sfe_ipv6_rule_destroy_msg *msg)
-{
-	struct sfe_ipv6 *si = &__si6;
-	struct sfe_ipv6_connection *c;
-	bool ret;
-	struct sfe_ipv6_5tuple *tuple = &msg->tuple;
-
-	this_cpu_inc(si->stats_pcpu->connection_destroy_requests64);
-
-	spin_lock_bh(&si->lock);
-
-	/*
-	 * Check to see if we have a flow that matches the rule we're trying
-	 * to destroy. If there isn't then we can't destroy it.
-	 */
-	c = sfe_ipv6_find_connection(si, tuple->protocol, (struct sfe_ipv6_addr *)tuple->flow_ip, tuple->flow_ident,
-				     (struct sfe_ipv6_addr *)tuple->return_ip, tuple->return_ident);
-	if (!c) {
-		spin_unlock_bh(&si->lock);
-
-		this_cpu_inc(si->stats_pcpu->connection_destroy_misses64);
-
-		DEBUG_TRACE("connection does not exist - p: %d, s: %pI6:%u, d: %pI6:%u\n",
-			    tuple->protocol, tuple->flow_ip, ntohs(tuple->flow_ident),
-			    tuple->return_ip, ntohs(tuple->return_ident));
-		return;
-	}
-
-	/*
-	 * Remove our connection details from the hash tables.
-	 */
-	ret = sfe_ipv6_remove_connection(si, c);
-	spin_unlock_bh(&si->lock);
-
-	if (ret) {
-		sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_DESTROY);
-	}
-
-	DEBUG_INFO("connection destroyed - p: %d, s: %pI6:%u, d: %pI6:%u\n",
-		   tuple->protocol, tuple->flow_ip, ntohs(tuple->flow_ident),
-		   tuple->return_ip, ntohs(tuple->return_ident));
-}
-
-/*
- * sfe_ipv6_sync_invoke()
- *	Schedule a sync of many connection stats.
- */
-bool sfe_ipv6_sync_invoke(uint16_t index)
-{
-	struct sfe_ipv6 *si = &__si6;
-	return schedule_delayed_work_on(si->work_cpu, &(si->sync_dwork), 0);
-}
-
-/*
- * sfe_ipv6_register_sync_rule_callback()
- *	Register a callback for rule synchronization.
- */
-void sfe_ipv6_register_sync_rule_callback(sfe_sync_rule_callback_t sync_rule_callback)
-{
-	struct sfe_ipv6 *si = &__si6;
-
-	spin_lock_bh(&si->lock);
-	rcu_assign_pointer(si->sync_rule_callback, sync_rule_callback);
-	spin_unlock_bh(&si->lock);
-}
-
-/*
- * sfe_ipv6_register_many_sync_callback()
- *	Register a callback for syncing many connection stats at once.
- */
-void sfe_ipv6_register_many_sync_callback(sfe_ipv6_many_sync_callback_t cb)
-{
-	struct sfe_ipv6 *si = &__si6;
-
-	spin_lock_bh(&si->lock);
-	rcu_assign_pointer(si->many_sync_callback, cb);
-	spin_unlock_bh(&si->lock);
-}
-
-/*
- * sfe_ipv6_get_debug_dev()
- */
-static ssize_t sfe_ipv6_get_debug_dev(struct device *dev,
-				      struct device_attribute *attr,
-				      char *buf)
-{
-	struct sfe_ipv6 *si = &__si6;
-	ssize_t count;
-	int num;
-
-	spin_lock_bh(&si->lock);
-	num = si->debug_dev;
-	spin_unlock_bh(&si->lock);
-
-	count = snprintf(buf, (ssize_t)PAGE_SIZE, "%d\n", num);
-	return count;
-}
-
-/*
- * sfe_ipv6_destroy_all_rules_for_dev()
- *	Destroy all connections that match a particular device.
- *
- * If we pass dev as NULL then this destroys all connections.
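 *
 * Usage sketch (the NULL form mirrors the call in sfe_ipv6_exit()):
 *
 *	sfe_ipv6_destroy_all_rules_for_dev(ndev);	// one device's flows
 *	sfe_ipv6_destroy_all_rules_for_dev(NULL);	// every flow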
- */
-void sfe_ipv6_destroy_all_rules_for_dev(struct net_device *dev)
-{
-	struct sfe_ipv6 *si = &__si6;
-	struct sfe_ipv6_connection *c;
-	bool ret;
-
-another_round:
-	spin_lock_bh(&si->lock);
-
-	for (c = si->all_connections_head; c; c = c->all_connections_next) {
-		/*
-		 * Does this connection relate to the device we are destroying?
-		 */
-		if (!dev
-		    || (dev == c->original_dev)
-		    || (dev == c->reply_dev)) {
-			break;
-		}
-	}
-
-	if (c) {
-		ret = sfe_ipv6_remove_connection(si, c);
-	}
-
-	spin_unlock_bh(&si->lock);
-
-	if (c) {
-		if (ret) {
-			sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_DESTROY);
-		}
-		goto another_round;
-	}
-}
-
-/*
- * sfe_ipv6_periodic_sync()
- */
-static void sfe_ipv6_periodic_sync(struct work_struct *work)
-{
-	struct sfe_ipv6 *si = container_of((struct delayed_work *)work, struct sfe_ipv6, sync_dwork);
-	u64 now_jiffies;
-	int quota;
-	sfe_ipv6_many_sync_callback_t sync_rule_callback;
-	struct sfe_ipv6_connection *c;
-	struct sfe_ipv6_conn_sync *conn_sync;
-
-	now_jiffies = get_jiffies_64();
-
-	rcu_read_lock();
-	sync_rule_callback = rcu_dereference(si->many_sync_callback);
-	rcu_read_unlock();
-	if (!sync_rule_callback) {
-		return;
-	}
-
-	spin_lock_bh(&si->lock);
-
-	/*
-	 * If we have reached the end of the connection list, walk from
-	 * the connection head.
-	 */
-	c = si->wc_next;
-	if (unlikely(!c)) {
-		c = si->all_connections_head;
-	}
-
-	/*
-	 * Get the max number of connections to be put in this sync msg.
-	 */
-	quota = sfe_ipv6_sync_max_number;
-	conn_sync = sfe_ipv6_sync_many_msg->msg.conn_stats_many.conn_sync;
-
-	/*
-	 * Walk the "all connection" list and sync the connection state.
-	 */
-	while (likely(c && quota)) {
-		struct sfe_ipv6_connection_match *cm;
-		struct sfe_ipv6_connection_match *counter_cm;
-		struct sfe_connection_sync sis;
-
-		cm = c->original_match;
-		counter_cm = c->reply_match;
-
-		/*
-		 * If we didn't receive any packets in the original or reply
-		 * direction, move on to the next connection.
-		 */
-		if (!atomic_read(&cm->rx_packet_count) && !atomic_read(&counter_cm->rx_packet_count)) {
-			c = c->all_connections_next;
-			continue;
-		}
-
-		/*
-		 * Sync the connection state.
-		 */
-		sfe_ipv6_gen_sync_connection(si, c, &sis, SFE_SYNC_REASON_STATS, now_jiffies);
-		sfe_ipv6_stats_convert(conn_sync, &sis);
-
-		quota--;
-		conn_sync++;
-		c = c->all_connections_next;
-	}
-
-	/*
-	 * At the end of the loop, point wc_next at the connection where we
-	 * left off so that the next pass resumes from there.
-	 */
-	si->wc_next = c;
-	spin_unlock_bh(&si->lock);
-
-	if (c == NULL) {
-		DEBUG_INFO("Synced all connections\n");
-		sfe_ipv6_sync_many_msg->msg.conn_stats_many.next = 0;
-	} else {
-		DEBUG_INFO("Some connections left\n");
-		sfe_ipv6_sync_many_msg->msg.conn_stats_many.next = sfe_ipv6_sync_max_number - quota;
-	}
-	DEBUG_INFO("Synced [%d] connections\n", (sfe_ipv6_sync_max_number - quota));
-
-	sfe_ipv6_sync_many_msg->msg.conn_stats_many.count = sfe_ipv6_sync_max_number - quota;
-	sfe_ipv6_sync_many_msg->cm.response = SFE_CMN_RESPONSE_ACK;
-
-	sync_rule_callback(sfe_ipv6_sync_many_msg);
-}
-
-/*
- * sfe_ipv6_debug_dev_read_start()
- *	Generate part of the XML output.
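 *
 * Taken together, the write methods below emit one document of roughly
 * this shape:
 *
 *	<sfe_ipv6>
 *		<connections>
 *			<connection protocol="..." src_dev="..." ... />
 *		</connections>
 *		<exceptions>
 *			<exception name="..." count="..." />
 *		</exceptions>
 *		<stats num_connections="..." ... />
 *	</sfe_ipv6>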
- */
-static bool sfe_ipv6_debug_dev_read_start(struct sfe_ipv6 *si, char *buffer, char *msg, size_t *length,
-					  int *total_read, struct sfe_ipv6_debug_xml_write_state *ws)
-{
-	int bytes_read;
-
-	si->debug_read_seq++;
-
-	bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "<sfe_ipv6>\n");
-	if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) {
-		return false;
-	}
-
-	*length -= bytes_read;
-	*total_read += bytes_read;
-
-	ws->state++;
-	return true;
-}
-
-/*
- * sfe_ipv6_debug_dev_read_connections_start()
- *	Generate part of the XML output.
- */
-static bool sfe_ipv6_debug_dev_read_connections_start(struct sfe_ipv6 *si, char *buffer, char *msg, size_t *length,
-						      int *total_read, struct sfe_ipv6_debug_xml_write_state *ws)
-{
-	int bytes_read;
-
-	bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t<connections>\n");
-	if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) {
-		return false;
-	}
-
-	*length -= bytes_read;
-	*total_read += bytes_read;
-
-	ws->state++;
-	return true;
-}
-
-/*
- * sfe_ipv6_debug_dev_read_connections_connection()
- *	Generate part of the XML output.
- */
-static bool sfe_ipv6_debug_dev_read_connections_connection(struct sfe_ipv6 *si, char *buffer, char *msg, size_t *length,
-							    int *total_read, struct sfe_ipv6_debug_xml_write_state *ws)
-{
-	struct sfe_ipv6_connection *c;
-	struct sfe_ipv6_connection_match *original_cm;
-	struct sfe_ipv6_connection_match *reply_cm;
-	int bytes_read;
-	int protocol;
-	struct net_device *src_dev;
-	struct sfe_ipv6_addr src_ip;
-	struct sfe_ipv6_addr src_ip_xlate;
-	__be16 src_port;
-	__be16 src_port_xlate;
-	u64 src_rx_packets;
-	u64 src_rx_bytes;
-	struct net_device *dest_dev;
-	struct sfe_ipv6_addr dest_ip;
-	struct sfe_ipv6_addr dest_ip_xlate;
-	__be16 dest_port;
-	__be16 dest_port_xlate;
-	u64 dest_rx_packets;
-	u64 dest_rx_bytes;
-	u64 last_sync_jiffies;
-	u32 src_mark, dest_mark, src_priority, dest_priority, src_dscp, dest_dscp;
-	bool original_cm_sawf_valid, reply_cm_sawf_valid;
-	u32 flow_service_class, return_service_class;
-	u32 flow_msduq, return_msduq;
-	u32 packet, byte, original_cm_flags;
-	u16 pppoe_session_id;
-	u8 pppoe_remote_mac[ETH_ALEN];
-	u32 original_fast_xmit, reply_fast_xmit;
-#ifdef CONFIG_NF_FLOW_COOKIE
-	int src_flow_cookie, dst_flow_cookie;
-#endif
-
-	spin_lock_bh(&si->lock);
-
-	for (c = si->all_connections_head; c; c = c->all_connections_next) {
-		if (c->debug_read_seq < si->debug_read_seq) {
-			c->debug_read_seq = si->debug_read_seq;
-			break;
-		}
-	}
-
-	/*
-	 * If there were no connections then move to the next state.
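 *
 * The per-connection debug_read_seq acts as a resume cursor: a
 * connection whose sequence number lags the global one has not yet been
 * dumped in this pass, and bumping it in the loop above claims it, so
 * repeated read() calls walk the whole list exactly once per dump.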
- */
-	if (!c) {
-		spin_unlock_bh(&si->lock);
-		ws->state++;
-		return true;
-	}
-
-	original_cm = c->original_match;
-	reply_cm = c->reply_match;
-
-	protocol = c->protocol;
-	src_dev = c->original_dev;
-	src_ip = c->src_ip[0];
-	src_ip_xlate = c->src_ip_xlate[0];
-	src_port = c->src_port;
-	src_port_xlate = c->src_port_xlate;
-	src_priority = original_cm->priority;
-	src_dscp = original_cm->dscp >> SFE_IPV6_DSCP_SHIFT;
-
-	sfe_ipv6_connection_match_update_summary_stats(original_cm, &packet, &byte);
-	sfe_ipv6_connection_match_update_summary_stats(reply_cm, &packet, &byte);
-
-	src_rx_packets = original_cm->rx_packet_count64;
-	src_rx_bytes = original_cm->rx_byte_count64;
-	src_mark = original_cm->mark;
-	original_fast_xmit = original_cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_FAST_XMIT;
-	dest_dev = c->reply_dev;
-	dest_ip = c->dest_ip[0];
-	dest_ip_xlate = c->dest_ip_xlate[0];
-	dest_port = c->dest_port;
-	dest_port_xlate = c->dest_port_xlate;
-	dest_priority = reply_cm->priority;
-	dest_dscp = reply_cm->dscp >> SFE_IPV6_DSCP_SHIFT;
-	dest_rx_packets = reply_cm->rx_packet_count64;
-	dest_rx_bytes = reply_cm->rx_byte_count64;
-	last_sync_jiffies = get_jiffies_64() - c->last_sync_jiffies;
-	original_cm_flags = original_cm->flags;
-	pppoe_session_id = original_cm->pppoe_session_id;
-	ether_addr_copy(pppoe_remote_mac, original_cm->pppoe_remote_mac);
-	dest_mark = reply_cm->mark;
-	reply_fast_xmit = reply_cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_FAST_XMIT;
-	original_cm_sawf_valid = original_cm->sawf_valid;
-	reply_cm_sawf_valid = reply_cm->sawf_valid;
-	flow_service_class = SFE_GET_SAWF_SERVICE_CLASS(original_cm->mark);
-	flow_msduq = SFE_GET_SAWF_MSDUQ(original_cm->mark);
-	return_service_class = SFE_GET_SAWF_SERVICE_CLASS(reply_cm->mark);
-	return_msduq = SFE_GET_SAWF_MSDUQ(reply_cm->mark);
-
-#ifdef CONFIG_NF_FLOW_COOKIE
-	src_flow_cookie = original_cm->flow_cookie;
-	dst_flow_cookie = reply_cm->flow_cookie;
-#endif
-	spin_unlock_bh(&si->lock);
-
-	bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t\t<connection "
-			      "protocol=\"%u\" "
-			      "src_dev=\"%s\" "
-			      "src_ip=\"%pI6\" src_ip_xlate=\"%pI6\" "
-			      "src_port=\"%u\" src_port_xlate=\"%u\" "
-			      "src_priority=\"%u\" src_dscp=\"%u\" "
-			      "src_rx_pkts=\"%llu\" src_rx_bytes=\"%llu\" "
-			      "src_mark=\"%08x\" "
-			      "src_fast_xmit=\"%s\" "
-			      "dest_dev=\"%s\" "
-			      "dest_ip=\"%pI6\" dest_ip_xlate=\"%pI6\" "
-			      "dest_port=\"%u\" dest_port_xlate=\"%u\" "
-			      "dest_priority=\"%u\" dest_dscp=\"%u\" "
-			      "dest_rx_pkts=\"%llu\" dest_rx_bytes=\"%llu\" "
-			      "dest_mark=\"%08x\" "
-			      "dest_fast_xmit=\"%s\" "
-#ifdef CONFIG_NF_FLOW_COOKIE
-			      "src_flow_cookie=\"%d\" dst_flow_cookie=\"%d\" "
-#endif
-			      "last_sync=\"%llu\" ",
-			      protocol,
-			      src_dev->name,
-			      &src_ip, &src_ip_xlate,
-			      ntohs(src_port), ntohs(src_port_xlate),
-			      src_priority, src_dscp,
-			      src_rx_packets, src_rx_bytes,
-			      src_mark,
-			      original_fast_xmit ? "Yes" : "No",
-			      dest_dev->name,
-			      &dest_ip, &dest_ip_xlate,
-			      ntohs(dest_port), ntohs(dest_port_xlate),
-			      dest_priority, dest_dscp,
-			      dest_rx_packets, dest_rx_bytes,
-			      dest_mark,
-			      reply_fast_xmit ? "Yes" : "No",
-#ifdef CONFIG_NF_FLOW_COOKIE
-			      src_flow_cookie, dst_flow_cookie,
-#endif
-			      last_sync_jiffies);
-
-	if (original_cm_flags & (SFE_IPV6_CONNECTION_MATCH_FLAG_PPPOE_DECAP | SFE_IPV6_CONNECTION_MATCH_FLAG_PPPOE_ENCAP)) {
-		bytes_read += snprintf(msg + bytes_read, CHAR_DEV_MSG_SIZE, "pppoe_session_id=\"%u\" pppoe_server_MAC=\"%pM\" ",
-				       pppoe_session_id, pppoe_remote_mac);
-	}
-
-	if (original_cm_sawf_valid) {
-		bytes_read += snprintf(msg + bytes_read, CHAR_DEV_MSG_SIZE, "flow_service_class=\"%d\" flow_msduq=\"0x%x\" ",
-				       flow_service_class, flow_msduq);
-	}
-
-	if (reply_cm_sawf_valid) {
-		bytes_read += snprintf(msg + bytes_read, CHAR_DEV_MSG_SIZE, "return_service_class=\"%d\" return_msduq=\"0x%x\" ",
-				       return_service_class, return_msduq);
-	}
-
-	bytes_read += snprintf(msg + bytes_read, CHAR_DEV_MSG_SIZE, "/>\n");
-
-	if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) {
-		return false;
-	}
-
-	*length -= bytes_read;
-	*total_read += bytes_read;
-
-	return true;
-}
-
-/*
- * sfe_ipv6_debug_dev_read_connections_end()
- *	Generate part of the XML output.
- */
-static bool sfe_ipv6_debug_dev_read_connections_end(struct sfe_ipv6 *si, char *buffer, char *msg, size_t *length,
-						    int *total_read, struct sfe_ipv6_debug_xml_write_state *ws)
-{
-	int bytes_read;
-
-	bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t</connections>\n");
-	if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) {
-		return false;
-	}
-
-	*length -= bytes_read;
-	*total_read += bytes_read;
-
-	ws->state++;
-	return true;
-}
-
-/*
- * sfe_ipv6_debug_dev_read_exceptions_start()
- *	Generate part of the XML output.
- */
-static bool sfe_ipv6_debug_dev_read_exceptions_start(struct sfe_ipv6 *si, char *buffer, char *msg, size_t *length,
-						     int *total_read, struct sfe_ipv6_debug_xml_write_state *ws)
-{
-	int bytes_read;
-
-	bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t<exceptions>\n");
-	if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) {
-		return false;
-	}
-
-	*length -= bytes_read;
-	*total_read += bytes_read;
-
-	ws->state++;
-	return true;
-}
-
-/*
- * sfe_ipv6_debug_dev_read_exceptions_exception()
- *	Generate part of the XML output.
- */
-static bool sfe_ipv6_debug_dev_read_exceptions_exception(struct sfe_ipv6 *si, char *buffer, char *msg, size_t *length,
-							 int *total_read, struct sfe_ipv6_debug_xml_write_state *ws)
-{
-	int i;
-	u64 val = 0;
-
-	for_each_possible_cpu(i) {
-		const struct sfe_ipv6_stats *s = per_cpu_ptr(si->stats_pcpu, i);
-		val += s->exception_events64[ws->iter_exception];
-	}
-
-	if (val) {
-		int bytes_read;
-
-		bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE,
-				      "\t\t<exception name=\"%s\" count=\"%llu\" />\n",
-				      sfe_ipv6_exception_events_string[ws->iter_exception],
-				      val);
-
-		if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) {
-			return false;
-		}
-
-		*length -= bytes_read;
-		*total_read += bytes_read;
-	}
-
-	ws->iter_exception++;
-	if (ws->iter_exception >= SFE_IPV6_EXCEPTION_EVENT_LAST) {
-		ws->iter_exception = 0;
-		ws->state++;
-	}
-
-	return true;
-}
-
-/*
- * sfe_ipv6_debug_dev_read_exceptions_end()
- *	Generate part of the XML output.
- */
-static bool sfe_ipv6_debug_dev_read_exceptions_end(struct sfe_ipv6 *si, char *buffer, char *msg, size_t *length,
-						   int *total_read, struct sfe_ipv6_debug_xml_write_state *ws)
-{
-	int bytes_read;
-
-	bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t</exceptions>\n");
-	if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) {
-		return false;
-	}
-
-	*length -= bytes_read;
-	*total_read += bytes_read;
-
-	ws->state++;
-	return true;
-}
-
-/*
- * sfe_ipv6_debug_dev_read_stats()
- *	Generate part of the XML output.
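 *
 * The counters are kept per CPU; sfe_ipv6_update_summary_stats() is
 * assumed to fold them in the same way the exception counters are
 * summed above, i.e. roughly:
 *
 *	u64 total = 0;
 *	int cpu;
 *
 *	for_each_possible_cpu(cpu)
 *		total += per_cpu_ptr(si->stats_pcpu, cpu)->packets_forwarded64;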
- */
-static bool sfe_ipv6_debug_dev_read_stats(struct sfe_ipv6 *si, char *buffer, char *msg, size_t *length,
-					  int *total_read, struct sfe_ipv6_debug_xml_write_state *ws)
-{
-	int bytes_read;
-	struct sfe_ipv6_stats stats;
-	unsigned int num_conn;
-
-	sfe_ipv6_update_summary_stats(si, &stats);
-
-	spin_lock_bh(&si->lock);
-	num_conn = si->num_connections;
-	spin_unlock_bh(&si->lock);
-
-	bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t<stats "
-			      "num_connections=\"%u\" "
-			      "pkts_dropped=\"%llu\" "
-			      "pkts_fast_xmited=\"%llu\" "
-			      "pkts_forwarded=\"%llu\" pkts_not_forwarded=\"%llu\" "
-			      "create_requests=\"%llu\" create_collisions=\"%llu\" "
-			      "create_failures=\"%llu\" "
-			      "destroy_requests=\"%llu\" destroy_misses=\"%llu\" "
-			      "flushes=\"%llu\" "
-			      "hash_hits=\"%llu\" hash_reorders=\"%llu\" "
-			      "pppoe_encap_pkts_fwded=\"%llu\" "
-			      "pppoe_decap_pkts_fwded=\"%llu\" "
-			      "pppoe_bridge_pkts_fwded=\"%llu\" "
-			      "pppoe_bridge_pkts_3tuple_fwded=\"%llu\" />\n",
-			      num_conn,
-			      stats.packets_dropped64,
-			      stats.packets_fast_xmited64,
-			      stats.packets_forwarded64,
-			      stats.packets_not_forwarded64,
-			      stats.connection_create_requests64,
-			      stats.connection_create_collisions64,
-			      stats.connection_create_failures64,
-			      stats.connection_destroy_requests64,
-			      stats.connection_destroy_misses64,
-			      stats.connection_flushes64,
-			      stats.connection_match_hash_hits64,
-			      stats.connection_match_hash_reorders64,
-			      stats.pppoe_encap_packets_forwarded64,
-			      stats.pppoe_decap_packets_forwarded64,
-			      stats.pppoe_bridge_packets_forwarded64,
-			      stats.pppoe_bridge_packets_3tuple_forwarded64);
-	if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) {
-		return false;
-	}
-
-	*length -= bytes_read;
-	*total_read += bytes_read;
-
-	ws->state++;
-	return true;
-}
-
-/*
- * sfe_ipv6_debug_dev_read_end()
- *	Generate part of the XML output.
- */
-static bool sfe_ipv6_debug_dev_read_end(struct sfe_ipv6 *si, char *buffer, char *msg, size_t *length,
-					int *total_read, struct sfe_ipv6_debug_xml_write_state *ws)
-{
-	int bytes_read;
-
-	bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "</sfe_ipv6>\n");
-	if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) {
-		return false;
-	}
-
-	*length -= bytes_read;
-	*total_read += bytes_read;
-
-	ws->state++;
-	return true;
-}
-
-/*
- * Array of write functions that write various XML elements that correspond to
- * our XML output state machine.
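 *
 * The reader below just indexes this table with ws->state, so each
 * successful call advances the dump by one element (sketch):
 *
 *	while (ws->state != SFE_IPV6_DEBUG_XML_STATE_DONE)
 *		sfe_ipv6_debug_xml_write_methods[ws->state](si, buffer, msg,
 *							    &length, &total_read, ws);
 *
 * Each method advances ws->state itself once its element (or its last
 * sub-element) has been written.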
- */ -static sfe_ipv6_debug_xml_write_method_t sfe_ipv6_debug_xml_write_methods[SFE_IPV6_DEBUG_XML_STATE_DONE] = { - sfe_ipv6_debug_dev_read_start, - sfe_ipv6_debug_dev_read_connections_start, - sfe_ipv6_debug_dev_read_connections_connection, - sfe_ipv6_debug_dev_read_connections_end, - sfe_ipv6_debug_dev_read_exceptions_start, - sfe_ipv6_debug_dev_read_exceptions_exception, - sfe_ipv6_debug_dev_read_exceptions_end, - sfe_ipv6_debug_dev_read_stats, - sfe_ipv6_debug_dev_read_end, -}; - -/* - * sfe_ipv6_debug_dev_read() - * Send info to userspace upon read request from user - */ -static ssize_t sfe_ipv6_debug_dev_read(struct file *filp, char *buffer, size_t length, loff_t *offset) -{ - char msg[CHAR_DEV_MSG_SIZE]; - int total_read = 0; - struct sfe_ipv6_debug_xml_write_state *ws; - struct sfe_ipv6 *si = &__si6; - - ws = (struct sfe_ipv6_debug_xml_write_state *)filp->private_data; - while ((ws->state != SFE_IPV6_DEBUG_XML_STATE_DONE) && (length > CHAR_DEV_MSG_SIZE)) { - if ((sfe_ipv6_debug_xml_write_methods[ws->state])(si, buffer, msg, &length, &total_read, ws)) { - continue; - } - } - return total_read; -} - -/* - * sfe_ipv6_debug_dev_open() - */ -static int sfe_ipv6_debug_dev_open(struct inode *inode, struct file *file) -{ - struct sfe_ipv6_debug_xml_write_state *ws; - - ws = (struct sfe_ipv6_debug_xml_write_state *)file->private_data; - if (ws) { - return 0; - } - - ws = kzalloc(sizeof(struct sfe_ipv6_debug_xml_write_state), GFP_KERNEL); - if (!ws) { - return -ENOMEM; - } - - ws->state = SFE_IPV6_DEBUG_XML_STATE_START; - file->private_data = ws; - - return 0; -} - -/* - * sfe_ipv6_debug_dev_release() - */ -static int sfe_ipv6_debug_dev_release(struct inode *inode, struct file *file) -{ - struct sfe_ipv6_debug_xml_write_state *ws; - - ws = (struct sfe_ipv6_debug_xml_write_state *)file->private_data; - if (ws) { - /* - * We've finished with our output so free the write state. - */ - kfree(ws); - file->private_data = NULL; - } - - return 0; -} - -/* - * File operations used in the debug char device - */ -static struct file_operations sfe_ipv6_debug_dev_fops = { - .read = sfe_ipv6_debug_dev_read, - .open = sfe_ipv6_debug_dev_open, - .release = sfe_ipv6_debug_dev_release -}; - -#ifdef CONFIG_NF_FLOW_COOKIE -/* - * sfe_ipv6_register_flow_cookie_cb - * register a function in SFE to let SFE use this function to configure flow cookie for a flow - * - * Hardware driver which support flow cookie should register a callback function in SFE. Then SFE - * can use this function to configure flow cookie for a flow. 
- * return: 0, success; !=0, fail
- */
-int sfe_ipv6_register_flow_cookie_cb(sfe_ipv6_flow_cookie_set_func_t cb)
-{
-	struct sfe_ipv6 *si = &__si6;
-
-	BUG_ON(!cb);
-
-	if (si->flow_cookie_set_func) {
-		return -1;
-	}
-
-	rcu_assign_pointer(si->flow_cookie_set_func, cb);
-	return 0;
-}
-
-/*
- * sfe_ipv6_unregister_flow_cookie_cb
- *	unregister the function which is used to configure flow cookie for a flow
- *
- * return: 0, success; !=0, fail
- */
-int sfe_ipv6_unregister_flow_cookie_cb(sfe_ipv6_flow_cookie_set_func_t cb)
-{
-	struct sfe_ipv6 *si = &__si6;
-
-	RCU_INIT_POINTER(si->flow_cookie_set_func, NULL);
-	return 0;
-}
-
-/*
- * sfe_ipv6_get_flow_cookie()
- */
-static ssize_t sfe_ipv6_get_flow_cookie(struct device *dev,
-					struct device_attribute *attr,
-					char *buf)
-{
-	struct sfe_ipv6 *si = &__si6;
-	return snprintf(buf, (ssize_t)PAGE_SIZE, "%d\n", si->flow_cookie_enable);
-}
-
-/*
- * sfe_ipv6_set_flow_cookie()
- */
-static ssize_t sfe_ipv6_set_flow_cookie(struct device *dev,
-					struct device_attribute *attr,
-					const char *buf, size_t size)
-{
-	struct sfe_ipv6 *si = &__si6;
-	si->flow_cookie_enable = simple_strtol(buf, NULL, 0);
-
-	return size;
-}
-
-/*
- * sysfs attributes.
- */
-static const struct device_attribute sfe_ipv6_flow_cookie_attr =
-	__ATTR(flow_cookie_enable, S_IWUSR | S_IRUGO, sfe_ipv6_get_flow_cookie, sfe_ipv6_set_flow_cookie);
-#endif /*CONFIG_NF_FLOW_COOKIE*/
-
-/*
- * sfe_ipv6_get_cpu()
- */
-static ssize_t sfe_ipv6_get_cpu(struct device *dev,
-				struct device_attribute *attr,
-				char *buf)
-{
-	struct sfe_ipv6 *si = &__si6;
-	return snprintf(buf, (ssize_t)PAGE_SIZE, "%d\n", si->work_cpu);
-}
-
-/*
- * sfe_ipv6_set_cpu()
- */
-static ssize_t sfe_ipv6_set_cpu(struct device *dev,
-				struct device_attribute *attr,
-				const char *buf, size_t size)
-{
-	struct sfe_ipv6 *si = &__si6;
-	int work_cpu;
-
-	work_cpu = simple_strtol(buf, NULL, 0);
-	if ((work_cpu >= 0) && (work_cpu <= NR_CPUS)) {
-		si->work_cpu = work_cpu;
-	} else {
-		dev_err(dev, "%s is not in the valid range [0,%d]", buf, NR_CPUS);
-	}
-
-	return size;
-}
-
-/*
- * sysfs attributes.
- */
-static const struct device_attribute sfe_ipv6_cpu_attr =
-	__ATTR(stat_work_cpu, S_IWUSR | S_IRUGO, sfe_ipv6_get_cpu, sfe_ipv6_set_cpu);
-
-/*
- * sfe_ipv6_conn_match_hash_init()
- *	Initialize the connection match hash lists.
- */
-static void sfe_ipv6_conn_match_hash_init(struct sfe_ipv6 *si, int len)
-{
-	struct hlist_head *hash_list = si->hlist_conn_match_hash_head;
-	int i;
-
-	for (i = 0; i < len; i++) {
-		INIT_HLIST_HEAD(&hash_list[i]);
-	}
-}
-
-#ifdef SFE_PROCESS_LOCAL_OUT
-/*
- * sfe_ipv6_local_out()
- *	Called for packets from ip_local_out() - post encapsulation & other packets
- */
-static unsigned int sfe_ipv6_local_out(void *priv,
-				       struct sk_buff *skb,
-				       const struct nf_hook_state *nhs)
-{
-	struct sfe_l2_info l2_info = {0};
-
-	DEBUG_TRACE("sfe: sfe_ipv6_local_out hook called.\n");
-
-	if (likely(skb->skb_iif)) {
-		return sfe_ipv6_recv(skb->dev, skb, &l2_info, true) ? NF_STOLEN : NF_ACCEPT;
-	}
-
-	return NF_ACCEPT;
-}
-
-/*
- * struct nf_hook_ops sfe_ipv6_ops_local_out[]
- *	Hooks into netfilter local out packet monitoring points.
- */
-static struct nf_hook_ops sfe_ipv6_ops_local_out[] __read_mostly = {
-
-	/*
-	 * Local out routing hook is used to monitor packets.
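	 *
	 * Return-value semantics for the hook above, in sketch form:
	 * NF_STOLEN tells netfilter that SFE consumed the packet on its
	 * fast path, while NF_ACCEPT hands it back to the normal IPv6
	 * output path:
	 *
	 *	return sfe_ipv6_recv(skb->dev, skb, &l2_info, true)
	 *		? NF_STOLEN		// forwarded by SFE
	 *		: NF_ACCEPT;		// continue through the stack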
- */ - { - .hook = sfe_ipv6_local_out, - .pf = PF_INET6, - .hooknum = NF_INET_LOCAL_OUT, - .priority = NF_IP6_PRI_FIRST, - }, -}; -#endif - -/* - * sfe_ipv6_init() - */ -int sfe_ipv6_init(void) -{ - struct sfe_ipv6 *si = &__si6; - int result = -1; - - DEBUG_INFO("SFE IPv6 init\n"); - - sfe_ipv6_conn_match_hash_init(si, ARRAY_SIZE(si->hlist_conn_match_hash_head)); - - si->stats_pcpu = alloc_percpu_gfp(struct sfe_ipv6_stats, GFP_KERNEL | __GFP_ZERO); - if (!si->stats_pcpu) { - DEBUG_ERROR("failed to allocate stats memory for sfe_ipv6\n"); - goto exit0; - } - - /* - * Allocate per cpu per service class memory. - */ - si->stats_pcpu_psc = alloc_percpu_gfp(struct sfe_ipv6_service_class_stats_db, - GFP_KERNEL | __GFP_ZERO); - if (!si->stats_pcpu_psc) { - DEBUG_ERROR("failed to allocate per cpu per service clas stats memory\n"); - goto exit1; - } - - /* - * Create sys/sfe_ipv6 - */ - si->sys_ipv6 = kobject_create_and_add("sfe_ipv6", NULL); - if (!si->sys_ipv6) { - DEBUG_ERROR("failed to register sfe_ipv6\n"); - goto exit2; - } - - /* - * Create files, one for each parameter supported by this module. - */ - result = sysfs_create_file(si->sys_ipv6, &sfe_ipv6_debug_dev_attr.attr); - if (result) { - DEBUG_ERROR("failed to register debug dev file: %d\n", result); - goto exit3; - } - - result = sysfs_create_file(si->sys_ipv6, &sfe_ipv6_cpu_attr.attr); - if (result) { - DEBUG_ERROR("failed to register debug dev file: %d\n", result); - goto exit4; - } - -#ifdef CONFIG_NF_FLOW_COOKIE - result = sysfs_create_file(si->sys_ipv6, &sfe_ipv6_flow_cookie_attr.attr); - if (result) { - DEBUG_ERROR("failed to register flow cookie enable file: %d\n", result); - goto exit5; - } -#endif /* CONFIG_NF_FLOW_COOKIE */ - -#ifdef SFE_PROCESS_LOCAL_OUT -#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 13, 0)) - result = nf_register_hooks(sfe_ipv6_ops_local_out, ARRAY_SIZE(sfe_ipv6_ops_local_out)); -#else - result = nf_register_net_hooks(&init_net, sfe_ipv6_ops_local_out, ARRAY_SIZE(sfe_ipv6_ops_local_out)); -#endif - if (result < 0) { - DEBUG_ERROR("can't register nf local out hook: %d\n", result); - goto exit6; - } - DEBUG_INFO("Register nf local out hook success: %d\n", result); -#endif - - /* - * Register our debug char device. - */ - result = register_chrdev(0, "sfe_ipv6", &sfe_ipv6_debug_dev_fops); - if (result < 0) { - DEBUG_ERROR("Failed to register chrdev: %d\n", result); - goto exit7; - } - - si->debug_dev = result; - si->work_cpu = WORK_CPU_UNBOUND; - - /* - * Create work to handle periodic statistics. 
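 *
 * The sync work is kicked on demand rather than self-rearming; the
 * trigger (see sfe_ipv6_sync_invoke() above) is simply:
 *
 *	schedule_delayed_work_on(si->work_cpu, &si->sync_dwork, 0);
 *
 * with si->work_cpu defaulting to WORK_CPU_UNBOUND and selectable at
 * runtime through the stat_work_cpu sysfs attribute.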
- */
-	INIT_DELAYED_WORK(&(si->sync_dwork), sfe_ipv6_periodic_sync);
-
-	sfe_ipv6_sync_many_msg = kzalloc(PAGE_SIZE, GFP_KERNEL);
-	if (!sfe_ipv6_sync_many_msg) {
-		goto exit8;
-	}
-
-	sfe_ipv6_msg_init(sfe_ipv6_sync_many_msg, SFE_SPECIAL_INTERFACE_IPV6,
-			  SFE_TX_CONN_STATS_SYNC_MANY_MSG,
-			  sizeof(struct sfe_ipv6_conn_sync_many_msg),
-			  NULL,
-			  NULL);
-	sfe_ipv6_sync_max_number = (PAGE_SIZE - sizeof(struct sfe_ipv6_msg)) / sizeof(struct sfe_ipv6_conn_sync);
-
-	spin_lock_init(&si->lock);
-	return 0;
-
-exit8:
-	unregister_chrdev(si->debug_dev, "sfe_ipv6");
-
-exit7:
-#ifdef SFE_PROCESS_LOCAL_OUT
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 13, 0))
-	DEBUG_TRACE("sfe: Unregister local out hook\n");
-	nf_unregister_hooks(sfe_ipv6_ops_local_out, ARRAY_SIZE(sfe_ipv6_ops_local_out));
-#else
-	DEBUG_TRACE("sfe: Unregister local out hook\n");
-	nf_unregister_net_hooks(&init_net, sfe_ipv6_ops_local_out, ARRAY_SIZE(sfe_ipv6_ops_local_out));
-#endif
-exit6:
-#endif
-#ifdef CONFIG_NF_FLOW_COOKIE
-	sysfs_remove_file(si->sys_ipv6, &sfe_ipv6_flow_cookie_attr.attr);
-
-exit5:
-#endif /* CONFIG_NF_FLOW_COOKIE */
-	sysfs_remove_file(si->sys_ipv6, &sfe_ipv6_cpu_attr.attr);
-
-exit4:
-	sysfs_remove_file(si->sys_ipv6, &sfe_ipv6_debug_dev_attr.attr);
-
-exit3:
-	kobject_put(si->sys_ipv6);
-
-exit2:
-	free_percpu(si->stats_pcpu_psc);
-
-exit1:
-	free_percpu(si->stats_pcpu);
-
-exit0:
-	return result;
-}
-
-/*
- * sfe_ipv6_exit()
- */
-void sfe_ipv6_exit(void)
-{
-	struct sfe_ipv6 *si = &__si6;
-
-	DEBUG_INFO("SFE IPv6 exit\n");
-
-	/*
-	 * Destroy all connections.
-	 */
-	sfe_ipv6_destroy_all_rules_for_dev(NULL);
-
-	cancel_delayed_work(&si->sync_dwork);
-
-	unregister_chrdev(si->debug_dev, "sfe_ipv6");
-
-	free_percpu(si->stats_pcpu);
-	free_percpu(si->stats_pcpu_psc);
-
-#ifdef SFE_PROCESS_LOCAL_OUT
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 13, 0))
-	DEBUG_TRACE("sfe: Unregister local out hook\n");
-	nf_unregister_hooks(sfe_ipv6_ops_local_out, ARRAY_SIZE(sfe_ipv6_ops_local_out));
-#else
-	DEBUG_TRACE("sfe: Unregister local out hook\n");
-	nf_unregister_net_hooks(&init_net, sfe_ipv6_ops_local_out, ARRAY_SIZE(sfe_ipv6_ops_local_out));
-#endif
-#endif
-
-#ifdef CONFIG_NF_FLOW_COOKIE
-	sysfs_remove_file(si->sys_ipv6, &sfe_ipv6_flow_cookie_attr.attr);
-#endif /* CONFIG_NF_FLOW_COOKIE */
-
-	sysfs_remove_file(si->sys_ipv6, &sfe_ipv6_cpu_attr.attr);
-
-	sysfs_remove_file(si->sys_ipv6, &sfe_ipv6_debug_dev_attr.attr);
-
-	kobject_put(si->sys_ipv6);
-}
-
-#ifdef CONFIG_NF_FLOW_COOKIE
-EXPORT_SYMBOL(sfe_ipv6_register_flow_cookie_cb);
-EXPORT_SYMBOL(sfe_ipv6_unregister_flow_cookie_cb);
-#endif
diff --git a/shortcut-fe/sfe_ipv6.h b/shortcut-fe/sfe_ipv6.h
deleted file mode 100644
index 2a84c4338..000000000
--- a/shortcut-fe/sfe_ipv6.h
+++ /dev/null
@@ -1,496 +0,0 @@
-/*
- * sfe_ipv6.h
- *	Shortcut forwarding engine header file for IPv6.
- *
- * Copyright (c) 2015-2016, 2019-2020, The Linux Foundation. All rights reserved.
- * Copyright (c) 2021-2022 Qualcomm Innovation Center, Inc. All rights reserved.
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS.
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ - -#ifndef __SFE_IPV6_H -#define __SFE_IPV6_H - -#define CHAR_DEV_MSG_SIZE 768 - -#define SFE_IPV6_DSCP_MASK 0xf03f -#define SFE_IPV6_DSCP_SHIFT 2 - -#define SFE_IPV6_FRAG_OFFSET 0xfff8 - -/* - * generic IPv6 extension header - */ -struct sfe_ipv6_ext_hdr { - __u8 next_hdr; - __u8 hdr_len; - __u8 padding[6]; -}; - -/* - * Specifies the lower bound on ACK numbers carried in the TCP header - */ -#define SFE_IPV6_TCP_MAX_ACK_WINDOW 65520 - -/* - * IPv6 TCP connection match additional data. - */ -struct sfe_ipv6_tcp_connection_match { - u8 win_scale; /* Window scale */ - u32 max_win; /* Maximum window size seen */ - u32 end; /* Sequence number of the next byte to send (seq + segment length) */ - u32 max_end; /* Sequence number of the last byte to ack */ -}; - -/* - * Bit flags for IPv6 connection matching entry. - */ -#define SFE_IPV6_CONNECTION_MATCH_FLAG_XLATE_SRC (1<<0) - /* Perform source translation */ -#define SFE_IPV6_CONNECTION_MATCH_FLAG_XLATE_DEST (1<<1) - /* Perform destination translation */ -#define SFE_IPV6_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK (1<<2) - /* Ignore TCP sequence numbers */ -#define SFE_IPV6_CONNECTION_MATCH_FLAG_WRITE_FAST_ETH_HDR (1<<3) - /* Fast Ethernet header write */ -#define SFE_IPV6_CONNECTION_MATCH_FLAG_WRITE_L2_HDR (1<<4) - /* Fast Ethernet header write */ -#define SFE_IPV6_CONNECTION_MATCH_FLAG_PRIORITY_REMARK (1<<5) - /* remark priority of SKB */ -#define SFE_IPV6_CONNECTION_MATCH_FLAG_DSCP_REMARK (1<<6) - /* remark DSCP of packet */ -#define SFE_IPV6_CONNECTION_MATCH_FLAG_CSUM_OFFLOAD (1<<7) - /* checksum offload.*/ -#define SFE_IPV6_CONNECTION_MATCH_FLAG_PPPOE_DECAP (1<<8) - /* Indicates that PPPoE should be decapsulated */ -#define SFE_IPV6_CONNECTION_MATCH_FLAG_PPPOE_ENCAP (1<<9) - /* Indicates that PPPoE should be encapsulated */ -#define SFE_IPV6_CONNECTION_MATCH_FLAG_BRIDGE_FLOW (1<<10) - /* Bridge flow */ -#define SFE_IPV6_CONNECTION_MATCH_FLAG_MARK (1<<11) - /* set skb mark*/ -#define SFE_IPV6_CONNECTION_MATCH_FLAG_INSERT_EGRESS_VLAN_TAG (1<<12) - /* Insert VLAN tag */ -#define SFE_IPV6_CONNECTION_MATCH_FLAG_SRC_INTERFACE_CHECK (1<<13) - /* Source interface check */ -#define SFE_IPV6_CONNECTION_MATCH_FLAG_PASSTHROUGH (1<<14) - /* passthrough flow: encap/decap to be skipped for this flow */ -#define SFE_IPV6_CONNECTION_MATCH_FLAG_FAST_XMIT (1<<15) - /* go fast xmit*/ -#define SFE_IPV6_CONNECTION_MATCH_FLAG_FAST_XMIT_FLOW_CHECKED (1<<16) - /* fast xmit checked or not*/ -#define SFE_IPV6_CONNECTION_MATCH_FLAG_FAST_XMIT_DEV_ADMISSION (1<<17) - /* Fast xmit may be possible for this flow, if SFE check passes */ -#define SFE_IPV6_CONNECTION_MATCH_FLAG_SRC_INTERFACE_CHECK_NO_FLUSH (1<<18) - /* Source interface check but do not flush the connection */ - -/* - * IPv6 connection matching structure. - */ -struct sfe_ipv6_connection_match { - /* - * References to other objects. - */ - struct hlist_node hnode; - struct sfe_ipv6_connection *connection; - struct sfe_ipv6_connection_match *counter_match; - /* Matches the flow in the opposite direction as the one in connection */ - /* - * Characteristics that identify flows that match this rule. 
- */ - struct net_device *match_dev; /* Network device */ - u8 match_protocol; /* Protocol */ - struct sfe_ipv6_addr match_src_ip[1]; /* Source IP address */ - struct sfe_ipv6_addr match_dest_ip[1]; /* Destination IP address */ - __be16 match_src_port; /* Source port/connection ident */ - __be16 match_dest_port; /* Destination port/connection ident */ - - struct udp_sock *up; /* Stores UDP sock information; valid only in decap path */ -#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 13, 0)) - const struct inet6_protocol *proto; /* stores protocol handler; valid only in decap path */ -#else - struct inet6_protocol *proto; /* stores protocol handler; valid only in decap path */ -#endif - - /* - * Control the operations of the match. - */ - u32 flags; /* Bit flags */ -#ifdef CONFIG_NF_FLOW_COOKIE - u32 flow_cookie; /* used flow cookie, for debug */ -#endif -#ifdef CONFIG_XFRM - u32 flow_accel; /* The flow accelerated or not */ -#endif - - /* - * Connection state that we track once we match. - */ - union { /* Protocol-specific state */ - struct sfe_ipv6_tcp_connection_match tcp; - } protocol_state; - - /* - * VLAN headers - */ - struct sfe_vlan_hdr ingress_vlan_hdr[SFE_MAX_VLAN_DEPTH]; - struct sfe_vlan_hdr egress_vlan_hdr[SFE_MAX_VLAN_DEPTH]; - - /* - * Stats recorded in a sync period. These stats will be added to - * rx_packet_count64/rx_byte_count64 after a sync period. - */ - atomic_t rx_packet_count; - atomic_t rx_byte_count; - - /* - * Packet translation information. - */ - struct sfe_ipv6_addr xlate_src_ip[1]; /* Address after source translation */ - __be16 xlate_src_port; /* Port/connection ident after source translation */ - u16 xlate_src_csum_adjustment; - /* Transport layer checksum adjustment after source translation */ - struct sfe_ipv6_addr xlate_dest_ip[1]; /* Address after destination translation */ - __be16 xlate_dest_port; /* Port/connection ident after destination translation */ - u16 xlate_dest_csum_adjustment; - /* Transport layer checksum adjustment after destination translation */ - u32 mark; /* mark for outgoing packet */ - - /* - * QoS information - */ - u32 priority; - u32 dscp; - - /* - * Packet transmit information. - */ - struct net_device *xmit_dev; /* Network device on which to transmit */ - unsigned short int xmit_dev_mtu; - /* Interface MTU */ - u16 xmit_dest_mac[ETH_ALEN / 2]; - /* Destination MAC address to use when forwarding */ - u16 xmit_src_mac[ETH_ALEN / 2]; - /* Source MAC address to use when forwarding */ - - u8 ingress_vlan_hdr_cnt; /* Ingress active vlan headers count */ - u8 egress_vlan_hdr_cnt; /* Egress active vlan headers count */ - - /* - * Summary stats. - */ - u64 rx_packet_count64; - u64 rx_byte_count64; - - /* - * PPPoE information. - */ - u16 pppoe_session_id; - u8 pppoe_remote_mac[ETH_ALEN]; - - struct net_device *top_interface_dev; /* Used by tunipip6 to store decap VLAN netdevice.*/ - - /* - * Size of all needed L2 headers - */ - u16 l2_hdr_size; - - /* - * xmit device's feature - */ - netdev_features_t features; - - bool sawf_valid; /* Indicates mark has valid SAWF information. */ -}; - -/* - * Per-connection data structure. 
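 *
 * Each connection below owns exactly two of the match entries above, one
 * per direction, cross-linked through counter_match (sketch):
 *
 *	c->original_match->counter_match == c->reply_match
 *	c->reply_match->counter_match   == c->original_match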
- */ -struct sfe_ipv6_connection { - struct sfe_ipv6_connection *next; - /* Pointer to the next entry in a hash chain */ - struct sfe_ipv6_connection *prev; - /* Pointer to the previous entry in a hash chain */ - int protocol; /* IP protocol number */ - struct sfe_ipv6_addr src_ip[1]; /* Src IP addr pre-translation */ - struct sfe_ipv6_addr src_ip_xlate[1]; /* Src IP addr post-translation */ - struct sfe_ipv6_addr dest_ip[1]; /* Dest IP addr pre-translation */ - struct sfe_ipv6_addr dest_ip_xlate[1]; /* Dest IP addr post-translation */ - __be16 src_port; /* Src port pre-translation */ - __be16 src_port_xlate; /* Src port post-translation */ - __be16 dest_port; /* Dest port pre-translation */ - __be16 dest_port_xlate; /* Dest port post-translation */ - struct sfe_ipv6_connection_match *original_match; - /* Original direction matching structure */ - struct net_device *original_dev; - /* Original direction source device */ - struct sfe_ipv6_connection_match *reply_match; - /* Reply direction matching structure */ - struct net_device *reply_dev; /* Reply direction source device */ - u64 last_sync_jiffies; /* Jiffies count for the last sync */ - struct sfe_ipv6_connection *all_connections_next; - /* Pointer to the next entry in the list of all connections */ - struct sfe_ipv6_connection *all_connections_prev; - /* Pointer to the previous entry in the list of all connections */ - bool removed; /* Indicates the connection is removed */ - struct rcu_head rcu; /* delay rcu free */ - u32 debug_read_seq; /* sequence number for debug dump */ -}; - -/* - * IPv6 connections and hash table size information. - */ -#define SFE_IPV6_CONNECTION_HASH_SHIFT 12 -#define SFE_IPV6_CONNECTION_HASH_SIZE (1 << SFE_IPV6_CONNECTION_HASH_SHIFT) -#define SFE_IPV6_CONNECTION_HASH_MASK (SFE_IPV6_CONNECTION_HASH_SIZE - 1) - -enum sfe_ipv6_exception_events { - SFE_IPV6_EXCEPTION_EVENT_UDP_HEADER_INCOMPLETE, - SFE_IPV6_EXCEPTION_EVENT_UDP_NO_CONNECTION, - SFE_IPV6_EXCEPTION_EVENT_UDP_IP_OPTIONS_OR_INITIAL_FRAGMENT, - SFE_IPV6_EXCEPTION_EVENT_UDP_SMALL_TTL, - SFE_IPV6_EXCEPTION_EVENT_UDP_NEEDS_FRAGMENTATION, - SFE_IPV6_EXCEPTION_EVENT_TCP_HEADER_INCOMPLETE, - SFE_IPV6_EXCEPTION_EVENT_TCP_NO_CONNECTION_SLOW_FLAGS, - SFE_IPV6_EXCEPTION_EVENT_TCP_NO_CONNECTION_FAST_FLAGS, - SFE_IPV6_EXCEPTION_EVENT_TCP_IP_OPTIONS_OR_INITIAL_FRAGMENT, - SFE_IPV6_EXCEPTION_EVENT_TCP_SMALL_TTL, - SFE_IPV6_EXCEPTION_EVENT_TCP_NEEDS_FRAGMENTATION, - SFE_IPV6_EXCEPTION_EVENT_TCP_FLAGS, - SFE_IPV6_EXCEPTION_EVENT_TCP_SEQ_EXCEEDS_RIGHT_EDGE, - SFE_IPV6_EXCEPTION_EVENT_TCP_SMALL_DATA_OFFS, - SFE_IPV6_EXCEPTION_EVENT_TCP_BAD_SACK, - SFE_IPV6_EXCEPTION_EVENT_TCP_BIG_DATA_OFFS, - SFE_IPV6_EXCEPTION_EVENT_TCP_SEQ_BEFORE_LEFT_EDGE, - SFE_IPV6_EXCEPTION_EVENT_TCP_ACK_EXCEEDS_RIGHT_EDGE, - SFE_IPV6_EXCEPTION_EVENT_TCP_ACK_BEFORE_LEFT_EDGE, - SFE_IPV6_EXCEPTION_EVENT_ICMP_HEADER_INCOMPLETE, - SFE_IPV6_EXCEPTION_EVENT_ICMP_UNHANDLED_TYPE, - SFE_IPV6_EXCEPTION_EVENT_ICMP_IPV6_HEADER_INCOMPLETE, - SFE_IPV6_EXCEPTION_EVENT_ICMP_IPV6_NON_V6, - SFE_IPV6_EXCEPTION_EVENT_ICMP_IPV6_IP_OPTIONS_INCOMPLETE, - SFE_IPV6_EXCEPTION_EVENT_ICMP_IPV6_UDP_HEADER_INCOMPLETE, - SFE_IPV6_EXCEPTION_EVENT_ICMP_IPV6_TCP_HEADER_INCOMPLETE, - SFE_IPV6_EXCEPTION_EVENT_ICMP_IPV6_UNHANDLED_PROTOCOL, - SFE_IPV6_EXCEPTION_EVENT_ICMP_NO_CONNECTION, - SFE_IPV6_EXCEPTION_EVENT_ICMP_FLUSHED_CONNECTION, - SFE_IPV6_EXCEPTION_EVENT_HEADER_INCOMPLETE, - SFE_IPV6_EXCEPTION_EVENT_BAD_TOTAL_LENGTH, - SFE_IPV6_EXCEPTION_EVENT_NON_V6, - SFE_IPV6_EXCEPTION_EVENT_NON_INITIAL_FRAGMENT, - 
SFE_IPV6_EXCEPTION_EVENT_DATAGRAM_INCOMPLETE, - SFE_IPV6_EXCEPTION_EVENT_IP_OPTIONS_INCOMPLETE, - SFE_IPV6_EXCEPTION_EVENT_UNHANDLED_PROTOCOL, - SFE_IPV6_EXCEPTION_EVENT_FLOW_COOKIE_ADD_FAIL, - SFE_IPV6_EXCEPTION_EVENT_NO_HEADROOM, - SFE_IPV6_EXCEPTION_EVENT_INVALID_PPPOE_SESSION, - SFE_IPV6_EXCEPTION_EVENT_INCORRECT_PPPOE_PARSING, - SFE_IPV6_EXCEPTION_EVENT_PPPOE_NOT_SET_IN_CME, - SFE_IPV6_EXCEPTION_EVENT_PPPOE_BR_NOT_IN_CME, - SFE_IPV6_EXCEPTION_EVENT_INGRESS_VLAN_TAG_MISMATCH, - SFE_IPV6_EXCEPTION_EVENT_INVALID_SRC_IFACE, - SFE_IPV6_EXCEPTION_EVENT_TUNIPIP6_HEADER_INCOMPLETE, - SFE_IPV6_EXCEPTION_EVENT_TUNIPIP6_NO_CONNECTION, - SFE_IPV6_EXCEPTION_EVENT_TUNIPIP6_IP_OPTIONS_OR_INITIAL_FRAGMENT, - SFE_IPV6_EXCEPTION_EVENT_TUNIPIP6_SMALL_TTL, - SFE_IPV6_EXCEPTION_EVENT_TUNIPIP6_NEEDS_FRAGMENTATION, - SFE_IPV6_EXCEPTION_EVENT_TUNIPIP6_SYNC_ON_FIND, - SFE_IPV6_EXCEPTION_EVENT_GRE_HEADER_INCOMPLETE, - SFE_IPV6_EXCEPTION_EVENT_GRE_NO_CONNECTION, - SFE_IPV6_EXCEPTION_EVENT_GRE_IP_OPTIONS_OR_INITIAL_FRAGMENT, - SFE_IPV6_EXCEPTION_EVENT_GRE_SMALL_TTL, - SFE_IPV6_EXCEPTION_EVENT_GRE_NEEDS_FRAGMENTATION, - SFE_IPV6_EXCEPTION_EVENT_ESP_NO_CONNECTION, - SFE_IPV6_EXCEPTION_EVENT_ESP_IP_OPTIONS_OR_INITIAL_FRAGMENT, - SFE_IPV6_EXCEPTION_EVENT_ESP_NEEDS_FRAGMENTATION, - SFE_IPV6_EXCEPTION_EVENT_ESP_SMALL_TTL, - SFE_IPV6_EXCEPTION_EVENT_LAST -}; - -/* - * Per CPU stats - */ -struct sfe_ipv6_stats { - /* - * Stats recorded in a sync period. These stats will be added to - * connection_xxx64 after a sync period. - */ - u64 connection_create_requests64; - /* Number of IPv6 connection create requests */ - u64 connection_create_collisions64; - /* Number of IPv6 connection create requests that collided with existing hash table entries */ - u64 connection_create_failures64; - /* Number of IPv6 connection create request failures */ - - u64 connection_destroy_requests64; - /* Number of IPv6 connection destroy requests */ - u64 connection_destroy_misses64; - /* Number of IPv6 connection destroy requests that missed our hash table */ - u64 connection_match_hash_hits64; - /* Number of IPv6 connection match hash hits */ - u64 connection_match_hash_reorders64; - /* Number of IPv6 connection match hash reorders */ - u64 connection_flushes64; /* Number of IPv6 connection flushes */ - u64 packets_dropped64; /* Number of IPv6 packets dropped */ - u64 packets_forwarded64; /* Number of IPv6 packets forwarded */ - u64 packets_fast_xmited64; /* Number of IPv6 packets fast transmitted */ - u64 packets_not_forwarded64; /* Number of IPv6 packets not forwarded */ - u64 exception_events64[SFE_IPV6_EXCEPTION_EVENT_LAST]; - u64 pppoe_encap_packets_forwarded64; /* Number of IPv6 PPPoE encap packets forwarded */ - u64 pppoe_decap_packets_forwarded64; /* Number of IPv6 PPPoE decap packets forwarded */ - u64 pppoe_bridge_packets_forwarded64; /* Number of IPv6 PPPoE bridge packets forwarded */ - u64 pppoe_bridge_packets_3tuple_forwarded64; /* Number of IPv6 PPPoE bridge packets forwarded based on 3-tuple info */ -}; - -/* - * sfe_ipv6_per_service_class_stats - * Per service class stats - */ -struct sfe_ipv6_per_service_class_stats { - u64 tx_bytes; /* Byte count */ - u64 tx_packets; /* Packet count */ - seqcount_t seq; /* seq lock for read/write protection */ - /* - * TODO: Add the entries to be maintained later. - */ -}; - -/* - * sfe_ipv6_service_class_stats_db - * Stat entries for each service class.
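 */

/*
 * A minimal sketch, not from the original header, of how per-CPU counters of
 * this kind are usually totalled for a debug dump: walk every possible CPU
 * and sum. (The helper name is an assumption; reading the u64 counters
 * without synchronisation is taken as acceptable here since the dump is
 * diagnostic only.)
 */
static inline u64 sfe_ipv6_stats_total_forwarded(struct sfe_ipv6_stats __percpu *stats_pcpu)
{
	u64 total = 0;
	int cpu;

	for_each_possible_cpu(cpu)
		total += per_cpu_ptr(stats_pcpu, cpu)->packets_forwarded64;

	return total;
}

/*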
- */ -struct sfe_ipv6_service_class_stats_db{ - struct sfe_ipv6_per_service_class_stats psc_stats[SFE_MAX_SERVICE_CLASS_ID]; - /* Per service class stats */ -}; - -/* - * Per-module structure. - */ -struct sfe_ipv6 { - spinlock_t lock; /* Lock for SMP correctness */ - struct sfe_ipv6_connection *all_connections_head; - /* Head of the list of all connections */ - struct sfe_ipv6_connection *all_connections_tail; - /* Tail of the list of all connections */ - unsigned int num_connections; /* Number of connections */ - struct delayed_work sync_dwork; /* Work to sync the statistics */ - unsigned int work_cpu; /* The core to run stats sync on */ - - sfe_sync_rule_callback_t __rcu sync_rule_callback; - /* Callback function registered by a connection manager for stats syncing */ - sfe_ipv6_many_sync_callback_t __rcu many_sync_callback; - /* Callback function registered by a connection manager for many stats syncing */ - struct sfe_ipv6_connection *conn_hash[SFE_IPV6_CONNECTION_HASH_SIZE]; - /* Connection hash table */ - struct hlist_head hlist_conn_match_hash_head[SFE_IPV6_CONNECTION_HASH_SIZE]; -#ifdef CONFIG_NF_FLOW_COOKIE - struct sfe_ipv6_flow_cookie_entry sfe_flow_cookie_table[SFE_FLOW_COOKIE_SIZE]; - /* flow cookie table*/ - sfe_ipv6_flow_cookie_set_func_t flow_cookie_set_func; - /* function used to configure flow cookie in hardware*/ - int flow_cookie_enable; - /* Enable/disable flow cookie at runtime */ -#endif - struct sfe_ipv6_service_class_stats_db __percpu *stats_pcpu_psc; - /* Database to maintain per cpu per service class statistics */ - - struct sfe_ipv6_stats __percpu *stats_pcpu; - /* Common SFE counters. */ - - struct sfe_ipv6_connection *wc_next; - /* The next walk point in the all connection list*/ - - /* - * Control state. - */ - struct kobject *sys_ipv6; /* sysfs linkage */ - int debug_dev; /* Major number of the debug char device */ - u32 debug_read_seq; /* sequence number for debug dump */ -}; - -/* - * Enumeration of the XML output. - */ -enum sfe_ipv6_debug_xml_states { - SFE_IPV6_DEBUG_XML_STATE_START, - SFE_IPV6_DEBUG_XML_STATE_CONNECTIONS_START, - SFE_IPV6_DEBUG_XML_STATE_CONNECTIONS_CONNECTION, - SFE_IPV6_DEBUG_XML_STATE_CONNECTIONS_END, - SFE_IPV6_DEBUG_XML_STATE_EXCEPTIONS_START, - SFE_IPV6_DEBUG_XML_STATE_EXCEPTIONS_EXCEPTION, - SFE_IPV6_DEBUG_XML_STATE_EXCEPTIONS_END, - SFE_IPV6_DEBUG_XML_STATE_STATS, - SFE_IPV6_DEBUG_XML_STATE_END, - SFE_IPV6_DEBUG_XML_STATE_DONE -}; - -/* - * XML write state. 
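 */

/*
 * Hedged sketch of how the states above are typically consumed (the
 * write_methods table is an assumption, not SFE API): the debug read
 * handler advances linearly from START to DONE, invoking one write method
 * per state, e.g.
 *
 *	while (ws->state != SFE_IPV6_DEBUG_XML_STATE_DONE)
 *		if (!write_methods[ws->state](si, buffer, msg, &length, &total_read, ws))
 *			break;
 */

/*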
- */ -struct sfe_ipv6_debug_xml_write_state { - enum sfe_ipv6_debug_xml_states state; - /* XML output file state machine state */ - int iter_exception; /* Next exception iterator */ -}; - -typedef bool (*sfe_ipv6_debug_xml_write_method_t)(struct sfe_ipv6 *si, char *buffer, char *msg, size_t *length, - int *total_read, struct sfe_ipv6_debug_xml_write_state *ws); - -/* - * sfe_ipv6_is_ext_hdr() - * check if we recognize ipv6 extension header - */ -static inline bool sfe_ipv6_is_ext_hdr(u8 hdr) -{ - return (hdr == NEXTHDR_HOP) || - (hdr == NEXTHDR_ROUTING) || - (hdr == NEXTHDR_FRAGMENT) || - (hdr == NEXTHDR_AUTH) || - (hdr == NEXTHDR_DEST) || - (hdr == NEXTHDR_MOBILITY); -} - -/* - * sfe_ipv6_change_dsfield() - * change dscp field in IPv6 packet - */ -static inline void sfe_ipv6_change_dsfield(struct ipv6hdr *iph, u8 dscp) -{ - __be16 *p = (__be16 *)iph; - - *p = ((*p & htons(SFE_IPV6_DSCP_MASK)) | htons((u16)dscp << 4)); -} - -bool sfe_ipv6_service_class_stats_get(uint8_t sid, uint64_t *bytes, uint64_t *packets); -void sfe_ipv6_exception_stats_inc(struct sfe_ipv6 *si, enum sfe_ipv6_exception_events reason); -void sfe_ipv6_service_class_stats_inc(struct sfe_ipv6 *si, uint8_t sid, uint64_t bytes); -struct sfe_ipv6_connection_match * -sfe_ipv6_find_connection_match_rcu(struct sfe_ipv6 *si, struct net_device *dev, u8 protocol, - struct sfe_ipv6_addr *src_ip, __be16 src_port, - struct sfe_ipv6_addr *dest_ip, __be16 dest_port); - -bool sfe_ipv6_remove_connection(struct sfe_ipv6 *si, struct sfe_ipv6_connection *c); - -void sfe_ipv6_flush_connection(struct sfe_ipv6 *si, - struct sfe_ipv6_connection *c, - sfe_sync_reason_t reason); - -void sfe_ipv6_sync_status(struct sfe_ipv6 *si, - struct sfe_ipv6_connection *c, - sfe_sync_reason_t reason); - -void sfe_ipv6_exit(void); -int sfe_ipv6_init(void); - -#endif /* __SFE_IPV6_H */ diff --git a/shortcut-fe/sfe_ipv6_esp.c b/shortcut-fe/sfe_ipv6_esp.c deleted file mode 100644 index 7a152e820..000000000 --- a/shortcut-fe/sfe_ipv6_esp.c +++ /dev/null @@ -1,275 +0,0 @@ -/* - * sfe_ipv6_esp.c - * Shortcut forwarding engine - IPv6 ESP implementation - * - * Copyright (c) 2022 Qualcomm Innovation Center, Inc. All rights reserved. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
- */ - -#include -#include -#include -#include -#include - -#include "sfe_debug.h" -#include "sfe_api.h" -#include "sfe.h" -#include "sfe_flow_cookie.h" -#include "sfe_ipv6.h" -#include "sfe_ipv6_esp.h" - -/* - * sfe_ipv6_recv_esp() - * Handle ESP packet receives and forwarding - */ -int sfe_ipv6_recv_esp(struct sfe_ipv6 *si, struct sk_buff *skb, struct net_device *dev, - unsigned int len, struct ipv6hdr *iph, unsigned int ihl, - bool sync_on_find, bool tun_outer) -{ - struct sfe_ipv6_connection_match *cm; - struct sfe_ipv6_addr *src_ip; - struct sfe_ipv6_addr *dest_ip; - struct net_device *xmit_dev; - struct inet6_protocol *ipprot; - netdev_features_t features; - bool bridge_flow; - bool passthrough; - bool fast_xmit; - bool ret; - - /* - * Read the IP address from the iphdr, and set the src/dst ports to 0. - */ - src_ip = (struct sfe_ipv6_addr *)iph->saddr.s6_addr32; - dest_ip = (struct sfe_ipv6_addr *)iph->daddr.s6_addr32; - rcu_read_lock(); - - /* - * Look for a connection match. - */ -#ifdef CONFIG_NF_FLOW_COOKIE - cm = si->sfe_flow_cookie_table[skb->flow_cookie & SFE_FLOW_COOKIE_MASK].match; - if (unlikely(!cm)) { - cm = sfe_ipv6_find_connection_match_rcu(si, dev, IPPROTO_ESP, src_ip, 0, dest_ip, 0); - } -#else - cm = sfe_ipv6_find_connection_match_rcu(si, dev, IPPROTO_ESP, src_ip, 0, dest_ip, 0); -#endif - if (unlikely(!cm)) { - rcu_read_unlock(); - sfe_ipv6_exception_stats_inc(si, SFE_IPV6_EXCEPTION_EVENT_ESP_NO_CONNECTION); - - DEBUG_TRACE("no connection found for esp packet\n"); - return 0; - } - - /* - * Validate the source interface. - */ - if (unlikely((cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_SRC_INTERFACE_CHECK) && (cm->match_dev != dev))) { - struct sfe_ipv6_connection *c = cm->connection; - int ret; - - spin_lock_bh(&si->lock); - ret = sfe_ipv6_remove_connection(si, c); - spin_unlock_bh(&si->lock); - - if (ret) { - sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); - } - rcu_read_unlock(); - sfe_ipv6_exception_stats_inc(si, SFE_IPV6_EXCEPTION_EVENT_INVALID_SRC_IFACE); - DEBUG_TRACE("flush on wrong source interface check failure\n"); - return 0; - } - - passthrough = cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_PASSTHROUGH; - bridge_flow = !!(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_BRIDGE_FLOW); - - /* - * If our packet has been marked as "sync on find" we can't actually - * forward it in the fast path, but now that we've found an associated - * connection we need to sync its status before exceptioning it to the - * slow path, unless it is a passthrough packet. - * TODO: revisit to ensure that pass through traffic is not bypassing firewall for fragmented cases - */ - if (unlikely(sync_on_find) && !passthrough) { - sfe_ipv6_sync_status(si, cm->connection, SFE_SYNC_REASON_STATS); - rcu_read_unlock(); - - sfe_ipv6_exception_stats_inc(si, SFE_IPV6_EXCEPTION_EVENT_ESP_IP_OPTIONS_OR_INITIAL_FRAGMENT); - DEBUG_TRACE("Sync on find\n"); - return 0; - } - - /* - * Check if skb was cloned. If it was, unshare it. - */ - if (unlikely(skb_cloned(skb))) { - DEBUG_TRACE("%px: skb is a cloned skb\n", skb); - skb = skb_unshare(skb, GFP_ATOMIC); - if (!skb) { - DEBUG_WARN("Failed to unshare the cloned skb\n"); - rcu_read_unlock(); - return 0; - } - - /* - * Update the iphdr pointer with the unshared skb's data area. - */ - iph = (struct ipv6hdr *)skb->data; - } - - /* - * Decap packet: invoke the inet6_protocol handler for delivery of the packet.
- */ - ipprot = rcu_dereference(cm->proto); - if (likely(ipprot)) { - skb_reset_network_header(skb); - skb_pull(skb, ihl); - skb_reset_transport_header(skb); - xmit_dev = cm->xmit_dev; - skb->dev = xmit_dev; - - ret = ipprot->handler(skb); - if (ret) { - rcu_read_unlock(); - this_cpu_inc(si->stats_pcpu->packets_not_forwarded64); - DEBUG_TRACE("ESP handler returned error %u\n", ret); - return 0; - } - - rcu_read_unlock(); - this_cpu_inc(si->stats_pcpu->packets_forwarded64); - return 1; - } - - /* - * esp passthrough / ip local out scenarios - */ - /* - * If our packet is larger than the MTU of the transmit interface then - * we can't forward it easily. - */ - if (unlikely(len > cm->xmit_dev_mtu)) { - sfe_ipv6_sync_status(si, cm->connection, SFE_SYNC_REASON_STATS); - rcu_read_unlock(); - - sfe_ipv6_exception_stats_inc(si, SFE_IPV6_EXCEPTION_EVENT_ESP_NEEDS_FRAGMENTATION); - DEBUG_TRACE("Larger than MTU\n"); - return 0; - } - - /* - * need to ensure that TTL is >=2. - */ - if (!bridge_flow && (iph->hop_limit < 2) && passthrough) { - sfe_ipv6_sync_status(si, cm->connection, SFE_SYNC_REASON_STATS); - rcu_read_unlock(); - - sfe_ipv6_exception_stats_inc(si, SFE_IPV6_EXCEPTION_EVENT_ESP_SMALL_TTL); - DEBUG_TRACE("hop_limit too low\n"); - return 0; - } - - /* - * decrement TTL by 1. - */ - iph->hop_limit = iph->hop_limit - (u8)(!bridge_flow && !tun_outer); - - /* - * Update DSCP - */ - if (unlikely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_DSCP_REMARK)) { - sfe_ipv6_change_dsfield(iph, cm->dscp); - } - - /* - * Update traffic stats. - */ - atomic_inc(&cm->rx_packet_count); - atomic_add(len, &cm->rx_byte_count); - - xmit_dev = cm->xmit_dev; - skb->dev = xmit_dev; - - /* - * write the layer - 2 header. - */ - if (likely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_WRITE_L2_HDR)) { - if (unlikely(!(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_WRITE_FAST_ETH_HDR))) { - dev_hard_header(skb, xmit_dev, ETH_P_IPV6, cm->xmit_dest_mac, cm->xmit_src_mac, len); - } else { - /* - * For the simple case we write this really fast. - */ - struct ethhdr *eth = (struct ethhdr *)__skb_push(skb, ETH_HLEN); - eth->h_proto = htons(ETH_P_IPV6); - ether_addr_copy((u8 *)eth->h_dest, (u8 *)cm->xmit_dest_mac); - ether_addr_copy((u8 *)eth->h_source, (u8 *)cm->xmit_src_mac); - } - } - - /* - * Update priority of skb. - */ - if (unlikely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_PRIORITY_REMARK)) { - skb->priority = cm->priority; - } - - /* - * Mark outgoing packet. - */ - if (unlikely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_MARK)) { - skb->mark = cm->mark; - } - - /* - * For the first packets, check if it could got fast xmit. - */ - if (unlikely(!(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_FAST_XMIT_FLOW_CHECKED) - && (cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_FAST_XMIT_DEV_ADMISSION))){ - cm->features = netif_skb_features(skb); - if (likely(sfe_fast_xmit_check(skb, cm->features))) { - cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_FAST_XMIT; - } - cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_FAST_XMIT_FLOW_CHECKED; - } - - features = cm->features; - fast_xmit = !!(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_FAST_XMIT); - - rcu_read_unlock(); - this_cpu_inc(si->stats_pcpu->packets_forwarded64); - prefetch(skb_shinfo(skb)); - - /* - * We do per packet condition check before we could fast xmit the - * packet. - */ - if (likely(fast_xmit && dev_fast_xmit(skb, xmit_dev, features))) { - this_cpu_inc(si->stats_pcpu->packets_fast_xmited64); - return 1; - } - - /* - * Mark that this packet has been fast forwarded. 
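 *
 * (Hedged note, inferred from these sources rather than stated by them:
 * fast_forwarded is a bit added to the skb by the SFE kernel patches;
 * setting it before dev_queue_xmit() lets the patched kernel avoid
 * re-running the offload hook on a packet SFE has already handled.)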
- */ - skb->fast_forwarded = 1; - - dev_queue_xmit(skb); - return 1; -} diff --git a/shortcut-fe/sfe_ipv6_esp.h b/shortcut-fe/sfe_ipv6_esp.h deleted file mode 100644 index 287067097..000000000 --- a/shortcut-fe/sfe_ipv6_esp.h +++ /dev/null @@ -1,21 +0,0 @@ -/* - * sfe_ipv6_esp.h - * Shortcut forwarding engine - IPv6 ESP header file - * - * Copyright (c) 2022 Qualcomm Innovation Center, Inc. All rights reserved. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ - -int sfe_ipv6_recv_esp(struct sfe_ipv6 *si, struct sk_buff *skb, struct net_device *dev, unsigned int len, - struct ipv6hdr *iph, unsigned int ihl, bool sync_on_find, bool tun_outer); diff --git a/shortcut-fe/sfe_ipv6_gre.c b/shortcut-fe/sfe_ipv6_gre.c deleted file mode 100644 index eae7d56d7..000000000 --- a/shortcut-fe/sfe_ipv6_gre.c +++ /dev/null @@ -1,356 +0,0 @@ -/* - * sfe_ipv6_gre.c - * Shortcut forwarding engine file for IPv6 GRE - * - * Copyright (c) 2022 Qualcomm Innovation Center, Inc. All rights reserved. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ - -#include -#include -#include -#include -#include -#include - -#include "sfe_debug.h" -#include "sfe_api.h" -#include "sfe.h" -#include "sfe_flow_cookie.h" -#include "sfe_ipv6.h" -#include "sfe_pppoe.h" -#include "sfe_vlan.h" - -/* - * sfe_ipv6_recv_gre() - * Handle GRE packet receives and forwarding. - */ -int sfe_ipv6_recv_gre(struct sfe_ipv6 *si, struct sk_buff *skb, struct net_device *dev, - unsigned int len, struct ipv6hdr *iph, unsigned int ihl, bool sync_on_find, - struct sfe_l2_info *l2_info, bool tun_outer) -{ - struct sfe_ipv6_connection_match *cm; - struct sfe_ipv6_addr *dest_ip; - struct sfe_ipv6_addr *src_ip; - struct net_device *xmit_dev; - bool bridge_flow; - bool passthrough; - bool ret; - - /* - * Is our packet too short to contain a valid UDP header? - */ - if (!pskb_may_pull(skb, (sizeof(struct gre_base_hdr) + ihl))) { - - sfe_ipv6_exception_stats_inc(si, SFE_IPV6_EXCEPTION_EVENT_GRE_HEADER_INCOMPLETE); - DEBUG_TRACE("packet too short for GRE header\n"); - return 0; - } - - /* - * Read the IP address and port information. 
Read the IP header data first - * because we've almost certainly got that in the cache. We may not yet have - * the GRE header cached though so allow more time for any prefetching. - */ - src_ip = (struct sfe_ipv6_addr *)iph->saddr.s6_addr32; - dest_ip = (struct sfe_ipv6_addr *)iph->daddr.s6_addr32; - - rcu_read_lock(); - - /* - * Look for a connection match. - */ -#ifdef CONFIG_NF_FLOW_COOKIE - cm = si->sfe_flow_cookie_table[skb->flow_cookie & SFE_FLOW_COOKIE_MASK].match; - if (unlikely(!cm)) { - cm = sfe_ipv6_find_connection_match_rcu(si, dev, IPPROTO_GRE, src_ip, 0, dest_ip, 0); - } -#else - cm = sfe_ipv6_find_connection_match_rcu(si, dev, IPPROTO_GRE, src_ip, 0, dest_ip, 0); -#endif - if (unlikely(!cm)) { - rcu_read_unlock(); - sfe_ipv6_exception_stats_inc(si, SFE_IPV6_EXCEPTION_EVENT_GRE_NO_CONNECTION); - DEBUG_TRACE("no connection match found dev %s src ip %pI6 dest ip %pI6\n", dev->name, src_ip, dest_ip); - return 0; - } - - /* - * Do we expect an ingress VLAN tag for this flow? - */ - if (unlikely(!sfe_vlan_validate_ingress_tag(skb, cm->ingress_vlan_hdr_cnt, cm->ingress_vlan_hdr, l2_info))) { - rcu_read_unlock(); - sfe_ipv6_exception_stats_inc(si, SFE_IPV6_EXCEPTION_EVENT_INGRESS_VLAN_TAG_MISMATCH); - DEBUG_TRACE("VLAN tag mismatch. skb=%px\n", skb); - return 0; - } - - /* - * Validate the source interface. - */ - if (unlikely((cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_SRC_INTERFACE_CHECK) && (cm->match_dev != dev))) { - if (!(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_SRC_INTERFACE_CHECK_NO_FLUSH)) { - struct sfe_ipv6_connection *c = cm->connection; - int ret; - DEBUG_TRACE("flush on source interface check failure\n"); - spin_lock_bh(&si->lock); - ret = sfe_ipv6_remove_connection(si, c); - spin_unlock_bh(&si->lock); - - if (ret) { - sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); - } - } - rcu_read_unlock(); - sfe_ipv6_exception_stats_inc(si, SFE_IPV6_EXCEPTION_EVENT_INVALID_SRC_IFACE); - DEBUG_TRACE("exception the packet on source interface check failure\n"); - return 0; - } - - passthrough = cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_PASSTHROUGH; - - /* - * If our packet has been marked as "sync on find" we can't actually - * forward it in the fast path, but now that we've found an associated - * connection we need to sync its status before exceptioning it to the - * slow path, unless it is a passthrough packet. - * TODO: revisit to ensure that pass through traffic is not bypassing firewall for fragmented cases - */ - if (unlikely(sync_on_find) && !passthrough) { - sfe_ipv6_sync_status(si, cm->connection, SFE_SYNC_REASON_STATS); - rcu_read_unlock(); - - sfe_ipv6_exception_stats_inc(si, SFE_IPV6_EXCEPTION_EVENT_GRE_IP_OPTIONS_OR_INITIAL_FRAGMENT); - DEBUG_TRACE("Sync on find\n"); - return 0; - } - - bridge_flow = !!(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_BRIDGE_FLOW); - - /* - * Does our hop_limit allow forwarding? - */ - if (!bridge_flow && (iph->hop_limit < 2) && passthrough) { - sfe_ipv6_sync_status(si, cm->connection, SFE_SYNC_REASON_STATS); - rcu_read_unlock(); - - sfe_ipv6_exception_stats_inc(si, SFE_IPV6_EXCEPTION_EVENT_GRE_SMALL_TTL); - DEBUG_TRACE("hop_limit too low\n"); - return 0; - } - - /* - * Check if skb was cloned. If it was, unshare it. Because - * the data area is going to be written in this path and we don't want to - * change the cloned skb's data section.
- */ - if (unlikely(skb_cloned(skb))) { - DEBUG_TRACE("%px: skb is a cloned skb\n", skb); - skb = skb_unshare(skb, GFP_ATOMIC); - if (!skb) { - DEBUG_WARN("Failed to unshare the cloned skb\n"); - rcu_read_unlock(); - return 1; - } - - /* - * Update the iph and udph pointers with the unshared skb's data area. - */ - iph = (struct ipv6hdr *)skb->data; - } - - /* - * For PPPoE packets, match server MAC and session id - */ - if (unlikely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_PPPOE_DECAP)) { - struct ethhdr *eth; - bool pppoe_match; - - if (unlikely(!sfe_l2_parse_flag_check(l2_info, SFE_L2_PARSE_FLAGS_PPPOE_INGRESS))) { - rcu_read_unlock(); - DEBUG_TRACE("%px: PPPoE header not present in packet for PPPoE rule\n", skb); - sfe_ipv6_exception_stats_inc(si, SFE_IPV6_EXCEPTION_EVENT_INCORRECT_PPPOE_PARSING); - return 0; - } - - eth = eth_hdr(skb); - - pppoe_match = (cm->pppoe_session_id == sfe_l2_pppoe_session_id_get(l2_info)) && - ether_addr_equal((u8*)cm->pppoe_remote_mac, (u8 *)eth->h_source); - - if (unlikely(!pppoe_match)) { - DEBUG_TRACE("%px: PPPoE sessions ID %d and %d or MAC %pM and %pM did not match\n", - skb, cm->pppoe_session_id, sfe_l2_pppoe_session_id_get(l2_info), - cm->pppoe_remote_mac, eth->h_source); - rcu_read_unlock(); - sfe_ipv6_exception_stats_inc(si, SFE_IPV6_EXCEPTION_EVENT_INVALID_PPPOE_SESSION); - return 0; - } - - skb->protocol = htons(l2_info->protocol); - this_cpu_inc(si->stats_pcpu->pppoe_decap_packets_forwarded64); - } else if (unlikely(sfe_l2_parse_flag_check(l2_info, SFE_L2_PARSE_FLAGS_PPPOE_INGRESS))) { - - /* - * If packet contains PPPoE header but CME doesn't contain PPPoE flag yet we are exceptioning the packet to linux - */ - if (unlikely(!(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_BRIDGE_FLOW))) { - rcu_read_unlock(); - DEBUG_TRACE("%px: CME doesn't contain PPPoE flag but packet has PPPoE header\n", skb); - sfe_ipv6_exception_stats_inc(si, SFE_IPV6_EXCEPTION_EVENT_PPPOE_NOT_SET_IN_CME); - return 0; - - } - - /* - * For bridged flows when packet contains PPPoE header, restore the header back and forward to xmit interface - */ - __skb_push(skb, (sizeof(struct pppoe_hdr) + sizeof(struct sfe_ppp_hdr))); - - this_cpu_inc(si->stats_pcpu->pppoe_bridge_packets_forwarded64); - } - - /* - * protocol handler will be valid only in decap-path. - */ - if (cm->proto) { - struct inet6_protocol *ipprot = cm->proto; - skb_pull(skb, ihl); - skb_reset_transport_header(skb); - skb->fast_forwarded = 1; - - ret = ipprot->handler(skb); - if (ret) { - this_cpu_inc(si->stats_pcpu->packets_not_forwarded64); - rcu_read_unlock(); - return 1; - } - - /* - * Update traffic stats. - */ - atomic_inc(&cm->rx_packet_count); - atomic_add(len, &cm->rx_byte_count); - - this_cpu_inc(si->stats_pcpu->packets_forwarded64); - rcu_read_unlock(); - DEBUG_TRACE("%p: %s decap done\n",skb, __func__); - return 1; - } - - /* - * Check if skb has enough headroom to write L2 headers - */ - if (unlikely(skb_headroom(skb) < cm->l2_hdr_size)) { - rcu_read_unlock(); - DEBUG_WARN("%px: Not enough headroom: %u\n", skb, skb_headroom(skb)); - sfe_ipv6_exception_stats_inc(si, SFE_IPV6_EXCEPTION_EVENT_NO_HEADROOM); - return 0; - } - - /* - * If our packet is larger than the MTU of the transmit interface then - * we can't forward it easily. 
- */ - if (unlikely(len > cm->xmit_dev_mtu)) { - sfe_ipv6_sync_status(si, cm->connection, SFE_SYNC_REASON_STATS); - rcu_read_unlock(); - - sfe_ipv6_exception_stats_inc(si, SFE_IPV6_EXCEPTION_EVENT_GRE_NEEDS_FRAGMENTATION); - DEBUG_TRACE("Larger than MTU\n"); - return 0; - } - - /* - * Update DSCP - */ - if (unlikely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_DSCP_REMARK)) { - sfe_ipv6_change_dsfield(iph, cm->dscp); - } - - iph->hop_limit -= (u8)(!bridge_flow & !tun_outer); - - /* - * Update traffic stats. - */ - atomic_inc(&cm->rx_packet_count); - atomic_add(len, &cm->rx_byte_count); - - xmit_dev = cm->xmit_dev; - skb->dev = xmit_dev; - - /* - * For PPPoE flows, add PPPoE header before L2 header is added. - */ - if (unlikely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_PPPOE_ENCAP)) { - sfe_pppoe_add_header(skb, cm->pppoe_session_id, PPP_IPV6); - this_cpu_inc(si->stats_pcpu->pppoe_encap_packets_forwarded64); - } - - /* - * Check to see if we need to add VLAN tags - */ - if (unlikely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_INSERT_EGRESS_VLAN_TAG)) { - sfe_vlan_add_tag(skb, cm->egress_vlan_hdr_cnt, cm->egress_vlan_hdr); - } - - if (cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_WRITE_FAST_ETH_HDR) { - /* - * For the simple case we write this really fast. - */ - struct ethhdr *eth = (struct ethhdr *)__skb_push(skb, ETH_HLEN); - eth->h_proto = skb->protocol; - ether_addr_copy((u8 *)eth->h_dest, (u8 *)cm->xmit_dest_mac); - ether_addr_copy((u8 *)eth->h_source, (u8 *)cm->xmit_src_mac); - } else if (cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_WRITE_L2_HDR) { - dev_hard_header(skb, xmit_dev, ntohs(skb->protocol), - cm->xmit_dest_mac, cm->xmit_src_mac, len); - } - - /* - * Update priority of skb. - */ - if (unlikely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_PRIORITY_REMARK)) { - skb->priority = cm->priority; - } - - /* - * Mark outgoing packet. - */ - if (unlikely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_MARK)) { - skb->mark = cm->mark; - } - - rcu_read_unlock(); - - this_cpu_inc(si->stats_pcpu->packets_forwarded64); - - /* - * We're going to check for GSO flags when we transmit the packet so - * start fetching the necessary cache line now. - */ - prefetch(skb_shinfo(skb)); - - /* - * Mark that this packet has been fast forwarded. - */ - skb->fast_forwarded = 1; - - /* - * Send the packet on its way. - */ - dev_queue_xmit(skb); - - return 1; -} diff --git a/shortcut-fe/sfe_ipv6_gre.h b/shortcut-fe/sfe_ipv6_gre.h deleted file mode 100644 index dce2176f4..000000000 --- a/shortcut-fe/sfe_ipv6_gre.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * sfe_ipv6_gre.h - * Shortcut forwarding engine header file for IPv6 GRE - * - * Copyright (c) 2022 Qualcomm Innovation Center, Inc. All rights reserved. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
- */ - -int sfe_ipv6_recv_gre(struct sfe_ipv6 *si, struct sk_buff *skb, struct net_device *dev, - unsigned int len, struct ipv6hdr *iph, unsigned int ihl, bool sync_on_find, - struct sfe_l2_info *l2_info, bool tun_outer); diff --git a/shortcut-fe/sfe_ipv6_icmp.c b/shortcut-fe/sfe_ipv6_icmp.c deleted file mode 100644 index a28742c0e..000000000 --- a/shortcut-fe/sfe_ipv6_icmp.c +++ /dev/null @@ -1,207 +0,0 @@ -/* - * sfe_ipv6_icmp.c - * Shortcut forwarding engine file for IPv6 ICMP - * - * Copyright (c) 2015-2016, 2019-2020, The Linux Foundation. All rights reserved. - * Copyright (c) 2021 Qualcomm Innovation Center, Inc. All rights reserved. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ - -#include -#include -#include -#include -#include -#include -#include - -#include "sfe_debug.h" -#include "sfe_api.h" -#include "sfe.h" -#include "sfe_flow_cookie.h" -#include "sfe_ipv6.h" - -/* - * sfe_ipv6_recv_icmp() - * Handle ICMP packet receives. - * - * ICMP packets aren't handled as a "fast path" and always have us process them - * through the default Linux stack. What we do need to do is look for any errors - * about connections we are handling in the fast path. If we find any such - * connections then we want to flush their state so that the ICMP error path - * within Linux has all of the correct state should it need it. - */ -int sfe_ipv6_recv_icmp(struct sfe_ipv6 *si, struct sk_buff *skb, struct net_device *dev, - unsigned int len, struct ipv6hdr *iph, unsigned int ihl) -{ - struct icmp6hdr *icmph; - struct ipv6hdr *icmp_iph; - struct udphdr *icmp_udph; - struct tcphdr *icmp_tcph; - struct sfe_ipv6_addr *src_ip; - struct sfe_ipv6_addr *dest_ip; - __be16 src_port; - __be16 dest_port; - struct sfe_ipv6_connection_match *cm; - struct sfe_ipv6_connection *c; - u8 next_hdr; - bool ret; - - /* - * Is our packet too short to contain a valid ICMP header? - */ - len -= ihl; - if (!pskb_may_pull(skb, ihl + sizeof(struct icmp6hdr))) { - sfe_ipv6_exception_stats_inc(si, SFE_IPV6_EXCEPTION_EVENT_ICMP_HEADER_INCOMPLETE); - - DEBUG_TRACE("packet too short for ICMP header\n"); - return 0; - } - - /* - * We only handle "destination unreachable" and "time exceeded" messages. - */ - icmph = (struct icmp6hdr *)(skb->data + ihl); - if ((icmph->icmp6_type != ICMPV6_DEST_UNREACH) - && (icmph->icmp6_type != ICMPV6_TIME_EXCEED)) { - - sfe_ipv6_exception_stats_inc(si, SFE_IPV6_EXCEPTION_EVENT_ICMP_UNHANDLED_TYPE); - DEBUG_TRACE("unhandled ICMP type: 0x%x\n", icmph->icmp6_type); - return 0; - } - - /* - * Do we have the full embedded IP header? - * We should have 8 bytes of next L4 header - that's enough to identify - * the connection. 
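 *
 * (Illustrative aside, taken from the standard TCP/UDP wire formats rather
 * than this file: the first 8 bytes of either transport header are
 *
 *	offset 0: __be16 source port
 *	offset 2: __be16 dest port
 *	offset 4: length + checksum (UDP) or sequence number (TCP)
 *
 * so the 5-tuple needed for the lookup always fits inside those 8 bytes.)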
- */ - len -= sizeof(struct icmp6hdr); - ihl += sizeof(struct icmp6hdr); - if (!pskb_may_pull(skb, ihl + sizeof(struct ipv6hdr) + sizeof(struct sfe_ipv6_ext_hdr))) { - - sfe_ipv6_exception_stats_inc(si, SFE_IPV6_EXCEPTION_EVENT_ICMP_IPV6_HEADER_INCOMPLETE); - DEBUG_TRACE("Embedded IP header not complete\n"); - return 0; - } - - /* - * Is our embedded IP version wrong? - */ - icmp_iph = (struct ipv6hdr *)(icmph + 1); - if (unlikely(icmp_iph->version != 6)) { - - sfe_ipv6_exception_stats_inc(si, SFE_IPV6_EXCEPTION_EVENT_ICMP_IPV6_NON_V6); - DEBUG_TRACE("IP version: %u\n", icmp_iph->version); - return 0; - } - - len -= sizeof(struct ipv6hdr); - ihl += sizeof(struct ipv6hdr); - next_hdr = icmp_iph->nexthdr; - while (unlikely(sfe_ipv6_is_ext_hdr(next_hdr))) { - struct sfe_ipv6_ext_hdr *ext_hdr; - unsigned int ext_hdr_len; - - ext_hdr = (struct sfe_ipv6_ext_hdr *)(skb->data + ihl); - if (next_hdr == NEXTHDR_FRAGMENT) { - struct frag_hdr *frag_hdr = (struct frag_hdr *)ext_hdr; - unsigned int frag_off = ntohs(frag_hdr->frag_off); - - if (frag_off & SFE_IPV6_FRAG_OFFSET) { - - DEBUG_TRACE("non-initial fragment\n"); - sfe_ipv6_exception_stats_inc(si, SFE_IPV6_EXCEPTION_EVENT_NON_INITIAL_FRAGMENT); - return 0; - } - } - - ext_hdr_len = ext_hdr->hdr_len; - ext_hdr_len <<= 3; - ext_hdr_len += sizeof(struct sfe_ipv6_ext_hdr); - len -= ext_hdr_len; - ihl += ext_hdr_len; - /* - * We should have 8 bytes of next header - that's enough to identify - * the connection. - */ - if (!pskb_may_pull(skb, ihl + sizeof(struct sfe_ipv6_ext_hdr))) { - - sfe_ipv6_exception_stats_inc(si, SFE_IPV6_EXCEPTION_EVENT_HEADER_INCOMPLETE); - DEBUG_TRACE("extension header %d not completed\n", next_hdr); - return 0; - } - - next_hdr = ext_hdr->next_hdr; - } - - /* - * Handle the embedded transport layer header. - */ - switch (next_hdr) { - case IPPROTO_UDP: - icmp_udph = (struct udphdr *)(skb->data + ihl); - src_port = icmp_udph->source; - dest_port = icmp_udph->dest; - break; - - case IPPROTO_TCP: - icmp_tcph = (struct tcphdr *)(skb->data + ihl); - src_port = icmp_tcph->source; - dest_port = icmp_tcph->dest; - break; - - default: - - sfe_ipv6_exception_stats_inc(si, SFE_IPV6_EXCEPTION_EVENT_ICMP_IPV6_UNHANDLED_PROTOCOL); - DEBUG_TRACE("Unhandled embedded IP protocol: %u\n", next_hdr); - return 0; - } - - src_ip = (struct sfe_ipv6_addr *)icmp_iph->saddr.s6_addr32; - dest_ip = (struct sfe_ipv6_addr *)icmp_iph->daddr.s6_addr32; - - rcu_read_lock(); - /* - * Look for a connection match. Note that we reverse the source and destination - * here because our embedded message contains a packet that was sent in the - * opposite direction to the one in which we just received it. It will have - * been sent on the interface from which we received it though so that's still - * ok to use. - */ - cm = sfe_ipv6_find_connection_match_rcu(si, dev, icmp_iph->nexthdr, dest_ip, dest_port, src_ip, src_port); - if (unlikely(!cm)) { - rcu_read_unlock(); - sfe_ipv6_exception_stats_inc(si, SFE_IPV6_EXCEPTION_EVENT_ICMP_NO_CONNECTION); - DEBUG_TRACE("no connection found\n"); - return 0; - } - - /* - * We found a connection so now remove it from the connection list and flush - * its state. 
- */ - c = cm->connection; - spin_lock_bh(&si->lock); - ret = sfe_ipv6_remove_connection(si, c); - spin_unlock_bh(&si->lock); - - if (ret) { - sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); - } - - rcu_read_unlock(); - - sfe_ipv6_exception_stats_inc(si, SFE_IPV6_EXCEPTION_EVENT_ICMP_FLUSHED_CONNECTION); - return 0; -} diff --git a/shortcut-fe/sfe_ipv6_icmp.h b/shortcut-fe/sfe_ipv6_icmp.h deleted file mode 100644 index c1be02ce6..000000000 --- a/shortcut-fe/sfe_ipv6_icmp.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * sfe_ipv6_icmp.h - * Shortcut forwarding engine header file for IPv6 ICMP - * - * Copyright (c) 2015-2016, 2019-2020, The Linux Foundation. All rights reserved. - * Copyright (c) 2021 Qualcomm Innovation Center, Inc. All rights reserved. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ - -int sfe_ipv6_recv_icmp(struct sfe_ipv6 *si, struct sk_buff *skb, struct net_device *dev, - unsigned int len, struct ipv6hdr *iph, unsigned int ihl); diff --git a/shortcut-fe/sfe_ipv6_pppoe_br.c b/shortcut-fe/sfe_ipv6_pppoe_br.c deleted file mode 100644 index f3c80b786..000000000 --- a/shortcut-fe/sfe_ipv6_pppoe_br.c +++ /dev/null @@ -1,207 +0,0 @@ -/* - * sfe_ipv6_pppoe_br.c - * Shortcut forwarding engine - IPv6 PPPoE bridge implementation - * - * Copyright (c) 2022 Qualcomm Innovation Center, Inc. All rights reserved. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
- */ - -#include -#include -#include -#include - -#include "sfe_debug.h" -#include "sfe_api.h" -#include "sfe.h" -#include "sfe_ipv6.h" -#include "sfe_pppoe.h" -#include "sfe_vlan.h" - -/* - * sfe_ipv6_recv_pppoe_bridge() - * Process PPPoE bridge packets using 3-tuple acceleration - * - */ -int sfe_ipv6_recv_pppoe_bridge(struct sfe_ipv6 *si, struct sk_buff *skb, struct net_device *dev, - unsigned int len, struct ipv6hdr *iph, unsigned int ihl, struct sfe_l2_info *l2_info) -{ - struct sfe_ipv6_connection_match *cm; - u32 service_class_id; - struct net_device *xmit_dev; - int ret; - bool fast_xmit; - netdev_features_t features; - - rcu_read_lock(); - - cm = sfe_ipv6_find_connection_match_rcu(si, dev, IPPROTO_RAW, - (struct sfe_ipv6_addr *)iph->saddr.s6_addr32, 0, - (struct sfe_ipv6_addr *)iph->daddr.s6_addr32, - htons(sfe_l2_pppoe_session_id_get(l2_info))); - if (unlikely(!cm)) { - rcu_read_unlock(); - sfe_ipv6_exception_stats_inc(si, SFE_IPV6_EXCEPTION_EVENT_PPPOE_BR_NOT_IN_CME); - DEBUG_TRACE("%px: no connection found in 3-tuple lookup for PPPoE bridge flow\n", skb); - return 0; - } - - /* - * Source interface validate. - */ - if (unlikely((cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_SRC_INTERFACE_CHECK) && (cm->match_dev != dev))) { - if (!(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_SRC_INTERFACE_CHECK_NO_FLUSH)) { - struct sfe_ipv6_connection *c = cm->connection; - DEBUG_TRACE("flush on source interface check failure\n"); - spin_lock_bh(&si->lock); - ret = sfe_ipv6_remove_connection(si, c); - spin_unlock_bh(&si->lock); - - if (ret) { - sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); - } - } - rcu_read_unlock(); - sfe_ipv6_exception_stats_inc(si, SFE_IPV6_EXCEPTION_EVENT_INVALID_SRC_IFACE); - DEBUG_TRACE("exception the packet on source interface check failure\n"); - return 0; - } - - /* - * Do we expect an ingress VLAN tag for this flow? - */ - if (unlikely(!sfe_vlan_validate_ingress_tag(skb, cm->ingress_vlan_hdr_cnt, cm->ingress_vlan_hdr, l2_info))) { - rcu_read_unlock(); - sfe_ipv6_exception_stats_inc(si, SFE_IPV6_EXCEPTION_EVENT_INGRESS_VLAN_TAG_MISMATCH); - DEBUG_TRACE("VLAN tag mismatch. skb=%px\n", skb); - return 0; - } - - /* - * Check if skb has enough headroom to write L2 headers - */ - if (unlikely(skb_headroom(skb) < cm->l2_hdr_size)) { - rcu_read_unlock(); - DEBUG_WARN("%px: Not enough headroom: %u\n", skb, skb_headroom(skb)); - sfe_ipv6_exception_stats_inc(si, SFE_IPV6_EXCEPTION_EVENT_NO_HEADROOM); - return 0; - } - - /* - * Restore PPPoE header back - */ - __skb_push(skb, PPPOE_SES_HLEN); - - /* - * Update traffic stats. - */ - atomic_inc(&cm->rx_packet_count); - atomic_add(len, &cm->rx_byte_count); - - xmit_dev = cm->xmit_dev; - skb->dev = xmit_dev; - - /* - * Check to see if we need to add VLAN tags - */ - if (unlikely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_INSERT_EGRESS_VLAN_TAG)) { - sfe_vlan_add_tag(skb, cm->egress_vlan_hdr_cnt, cm->egress_vlan_hdr); - } - - /* - * Check to see if we need to write an Ethernet header. - */ - if (likely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_WRITE_L2_HDR)) { - if (unlikely(!(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_WRITE_FAST_ETH_HDR))) { - dev_hard_header(skb, xmit_dev, ntohs(skb->protocol), - cm->xmit_dest_mac, cm->xmit_src_mac, len); - } else { - /* - * For the simple case we write this really fast. 
- */ - struct ethhdr *eth = (struct ethhdr *)__skb_push(skb, ETH_HLEN); - eth->h_proto = skb->protocol; - ether_addr_copy((u8 *)eth->h_dest, (u8 *)cm->xmit_dest_mac); - ether_addr_copy((u8 *)eth->h_source, (u8 *)cm->xmit_src_mac); - } - } - - /* - * Update priority of skb. - */ - if (unlikely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_PRIORITY_REMARK)) { - skb->priority = cm->priority; - } - - /* - * Mark outgoing packet. - */ - if (unlikely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_MARK)) { - skb->mark = cm->mark; - /* - * Update service class stats if SAWF is valid. - */ - if (likely(cm->sawf_valid)) { - service_class_id = SFE_GET_SAWF_SERVICE_CLASS(cm->mark); - sfe_ipv6_service_class_stats_inc(si, service_class_id, len); - } - } - - /* - * For the first packets, check if it could got fast xmit. - */ - if (unlikely(!(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_FAST_XMIT_FLOW_CHECKED) - && (cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_FAST_XMIT_DEV_ADMISSION))){ - cm->features = netif_skb_features(skb); - if (likely(sfe_fast_xmit_check(skb, cm->features))) { - cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_FAST_XMIT; - } - cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_FAST_XMIT_FLOW_CHECKED; - } - features = cm->features; - - fast_xmit = !!(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_FAST_XMIT); - - rcu_read_unlock(); - - this_cpu_inc(si->stats_pcpu->pppoe_bridge_packets_3tuple_forwarded64); - this_cpu_inc(si->stats_pcpu->packets_forwarded64); - - /* - * We're going to check for GSO flags when we transmit the packet so - * start fetching the necessary cache line now. - */ - prefetch(skb_shinfo(skb)); - - /* - * We do per packet condition check before we could fast xmit the - * packet. - */ - if (likely(fast_xmit && dev_fast_xmit(skb, xmit_dev, features))) { - this_cpu_inc(si->stats_pcpu->packets_fast_xmited64); - return 1; - } - - /* - * Mark that this packet has been fast forwarded. - */ - skb->fast_forwarded = 1; - - /* - * Send the packet on its way. - */ - dev_queue_xmit(skb); - - return 1; -} diff --git a/shortcut-fe/sfe_ipv6_pppoe_br.h b/shortcut-fe/sfe_ipv6_pppoe_br.h deleted file mode 100644 index 84dc31372..000000000 --- a/shortcut-fe/sfe_ipv6_pppoe_br.h +++ /dev/null @@ -1,20 +0,0 @@ -/* - * sfe_ipv6_pppoe_br.h - * Shortcut forwarding engine - IPv6 PPPoE bridge header file - * - * Copyright (c) 2022 Qualcomm Innovation Center, Inc. All rights reserved. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
- */ -int sfe_ipv6_recv_pppoe_bridge(struct sfe_ipv6 *si, struct sk_buff *skb, struct net_device *dev, - unsigned int len, struct ipv6hdr *iph, unsigned int ihl, struct sfe_l2_info *l2_info); diff --git a/shortcut-fe/sfe_ipv6_tcp.c b/shortcut-fe/sfe_ipv6_tcp.c deleted file mode 100644 index 0cf867f2c..000000000 --- a/shortcut-fe/sfe_ipv6_tcp.c +++ /dev/null @@ -1,765 +0,0 @@ -/* - * sfe_ipv6_tcp.c - * Shortcut forwarding engine file for IPv6 TCP - * - * Copyright (c) 2015-2016, 2019-2020, The Linux Foundation. All rights reserved. - * Copyright (c) 2021-2022 Qualcomm Innovation Center, Inc. All rights reserved. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ - -#include -#include -#include -#include - -#include "sfe_debug.h" -#include "sfe_api.h" -#include "sfe.h" -#include "sfe_flow_cookie.h" -#include "sfe_ipv6.h" -#include "sfe_pppoe.h" -#include "sfe_vlan.h" - -/* - * sfe_ipv6_process_tcp_option_sack() - * Parse TCP SACK option and update ack according - */ -static bool sfe_ipv6_process_tcp_option_sack(const struct tcphdr *th, const u32 data_offs, - u32 *ack) -{ - u32 length = sizeof(struct tcphdr); - u8 *ptr = (u8 *)th + length; - - /* - * Ignore processing if TCP packet has only TIMESTAMP option. - */ - if (likely(data_offs == length + TCPOLEN_TIMESTAMP + 1 + 1) - && likely(ptr[0] == TCPOPT_NOP) - && likely(ptr[1] == TCPOPT_NOP) - && likely(ptr[2] == TCPOPT_TIMESTAMP) - && likely(ptr[3] == TCPOLEN_TIMESTAMP)) { - return true; - } - - /* - * TCP options. Parse SACK option. - */ - while (length < data_offs) { - u8 size; - u8 kind; - - ptr = (u8 *)th + length; - kind = *ptr; - - /* - * NOP, for padding - * Not in the switch because to fast escape and to not calculate size - */ - if (kind == TCPOPT_NOP) { - length++; - continue; - } - - if (kind == TCPOPT_SACK) { - u32 sack = 0; - u8 re = 1 + 1; - - size = *(ptr + 1); - if ((size < (1 + 1 + TCPOLEN_SACK_PERBLOCK)) - || ((size - (1 + 1)) % (TCPOLEN_SACK_PERBLOCK)) - || (size > (data_offs - length))) { - return false; - } - - re += 4; - while (re < size) { - u32 sack_re; - u8 *sptr = ptr + re; - sack_re = (sptr[0] << 24) | (sptr[1] << 16) | (sptr[2] << 8) | sptr[3]; - if (sack_re > sack) { - sack = sack_re; - } - re += TCPOLEN_SACK_PERBLOCK; - } - if (sack > *ack) { - *ack = sack; - } - length += size; - continue; - } - if (kind == TCPOPT_EOL) { - return true; - } - size = *(ptr + 1); - if (size < 2) { - return false; - } - length += size; - } - - return true; -} - -/* - * sfe_ipv6_recv_tcp() - * Handle TCP packet receives and forwarding. 
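 */

/*
 * A usage sketch for sfe_ipv6_process_tcp_option_sack() above (illustrative
 * only, not part of the original file): fold any SACK right edge into the
 * plain ACK so window bookkeeping treats SACKed data as acknowledged. The
 * real receive path below does this inline and flushes the connection when
 * the parser reports a malformed option.
 */
static inline u32 sfe_ipv6_tcp_effective_ack(const struct tcphdr *th, bool *good)
{
	u32 ack = ntohl(th->ack_seq);

	*good = sfe_ipv6_process_tcp_option_sack(th, th->doff << 2, &ack);
	return ack;
}

/*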
- */ -int sfe_ipv6_recv_tcp(struct sfe_ipv6 *si, struct sk_buff *skb, struct net_device *dev, - unsigned int len, struct ipv6hdr *iph, unsigned int ihl, bool sync_on_find, struct sfe_l2_info *l2_info) -{ - struct tcphdr *tcph; - struct sfe_ipv6_addr *src_ip; - struct sfe_ipv6_addr *dest_ip; - __be16 src_port; - __be16 dest_port; - struct sfe_ipv6_connection_match *cm; - struct sfe_ipv6_connection_match *counter_cm; - u32 flags; - u32 service_class_id; - struct net_device *xmit_dev; - bool ret; - bool hw_csum; - bool bridge_flow; - bool fast_xmit; - netdev_features_t features; - - /* - * Is our packet too short to contain a valid TCP header? - */ - if (!pskb_may_pull(skb, (sizeof(struct tcphdr) + ihl))) { - - sfe_ipv6_exception_stats_inc(si, SFE_IPV6_EXCEPTION_EVENT_TCP_HEADER_INCOMPLETE); - DEBUG_TRACE("packet too short for TCP header\n"); - return 0; - } - - /* - * Read the IP address and port information. Read the IP header data first - * because we've almost certainly got that in the cache. We may not yet have - * the TCP header cached though so allow more time for any prefetching. - */ - src_ip = (struct sfe_ipv6_addr *)iph->saddr.s6_addr32; - dest_ip = (struct sfe_ipv6_addr *)iph->daddr.s6_addr32; - - tcph = (struct tcphdr *)(skb->data + ihl); - src_port = tcph->source; - dest_port = tcph->dest; - flags = tcp_flag_word(tcph); - - rcu_read_lock(); - - /* - * Look for a connection match. - */ -#ifdef CONFIG_NF_FLOW_COOKIE - cm = si->sfe_flow_cookie_table[skb->flow_cookie & SFE_FLOW_COOKIE_MASK].match; - if (unlikely(!cm)) { - cm = sfe_ipv6_find_connection_match_rcu(si, dev, IPPROTO_TCP, src_ip, src_port, dest_ip, dest_port); - } -#else - cm = sfe_ipv6_find_connection_match_rcu(si, dev, IPPROTO_TCP, src_ip, src_port, dest_ip, dest_port); -#endif - if (unlikely(!cm)) { - /* - * We didn't get a connection but as TCP is connection-oriented that - * may be because this is a non-fast connection (not running established). - * For diagnostic purposes we differentiate this here. - */ - if (likely((flags & (TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_FIN | TCP_FLAG_ACK)) == TCP_FLAG_ACK)) { - rcu_read_unlock(); - - sfe_ipv6_exception_stats_inc(si, SFE_IPV6_EXCEPTION_EVENT_TCP_NO_CONNECTION_FAST_FLAGS); - - DEBUG_TRACE("no connection found - fast flags\n"); - return 0; - } - - rcu_read_unlock(); - - sfe_ipv6_exception_stats_inc(si, SFE_IPV6_EXCEPTION_EVENT_TCP_NO_CONNECTION_SLOW_FLAGS); - DEBUG_TRACE("no connection found - slow flags: 0x%x\n", - flags & (TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_FIN | TCP_FLAG_ACK)); - return 0; - } - - /* - * Source interface validate. - */ - if (unlikely((cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_SRC_INTERFACE_CHECK) && (cm->match_dev != dev))) { - if (!(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_SRC_INTERFACE_CHECK_NO_FLUSH)) { - struct sfe_ipv6_connection *c = cm->connection; - DEBUG_TRACE("flush on source interface check failure\n"); - spin_lock_bh(&si->lock); - ret = sfe_ipv6_remove_connection(si, c); - spin_unlock_bh(&si->lock); - - if (ret) { - sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); - } - } - rcu_read_unlock(); - sfe_ipv6_exception_stats_inc(si, SFE_IPV6_EXCEPTION_EVENT_INVALID_SRC_IFACE); - DEBUG_TRACE("exception the packet on source interface check failure\n"); - return 0; - } - - /* - * If our packet has been marked as "sync on find" we can't actually - * forward it in the fast path, but now that we've found an associated - * connection we need sync its status before throw it slow path. 
- */ - if (unlikely(sync_on_find)) { - sfe_ipv6_sync_status(si, cm->connection, SFE_SYNC_REASON_STATS); - rcu_read_unlock(); - - sfe_ipv6_exception_stats_inc(si, SFE_IPV6_EXCEPTION_EVENT_TCP_IP_OPTIONS_OR_INITIAL_FRAGMENT); - DEBUG_TRACE("Sync on find\n"); - return 0; - } - -#ifdef CONFIG_XFRM - /* - * We can't accelerate the flow on this direction, just let it go - * through the slow path. - */ - if (unlikely(!cm->flow_accel)) { - rcu_read_unlock(); - this_cpu_inc(si->stats_pcpu->packets_not_forwarded64); - return 0; - } -#endif - - /* - * Do we expect an ingress VLAN tag for this flow? - */ - if (unlikely(!sfe_vlan_validate_ingress_tag(skb, cm->ingress_vlan_hdr_cnt, cm->ingress_vlan_hdr, l2_info))) { - rcu_read_unlock(); - sfe_ipv6_exception_stats_inc(si, SFE_IPV6_EXCEPTION_EVENT_INGRESS_VLAN_TAG_MISMATCH); - DEBUG_TRACE("VLAN tag mismatch. skb=%px\n", skb); - return 0; - } - - bridge_flow = !!(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_BRIDGE_FLOW); - - /* - * Does our hop_limit allow forwarding? - */ - if (likely(!bridge_flow)) { - if (unlikely(iph->hop_limit < 2)) { - sfe_ipv6_sync_status(si, cm->connection, SFE_SYNC_REASON_STATS); - rcu_read_unlock(); - - sfe_ipv6_exception_stats_inc(si, SFE_IPV6_EXCEPTION_EVENT_TCP_SMALL_TTL); - DEBUG_TRACE("hop_limit too low\n"); - return 0; - } - } - - /* - * If our packet is larger than the MTU of the transmit interface then - * we can't forward it easily. - */ - if (unlikely((len > cm->xmit_dev_mtu) && !skb_is_gso(skb))) { - sfe_ipv6_sync_status(si, cm->connection, SFE_SYNC_REASON_STATS); - rcu_read_unlock(); - - sfe_ipv6_exception_stats_inc(si, SFE_IPV6_EXCEPTION_EVENT_TCP_NEEDS_FRAGMENTATION); - DEBUG_TRACE("Larger than MTU\n"); - return 0; - } - - /* - * Look at our TCP flags. Anything missing an ACK or that has RST, SYN or FIN - * set is not a fast path packet. - */ - if (unlikely((flags & (TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_FIN | TCP_FLAG_ACK)) != TCP_FLAG_ACK)) { - struct sfe_ipv6_connection *c = cm->connection; - spin_lock_bh(&si->lock); - ret = sfe_ipv6_remove_connection(si, c); - spin_unlock_bh(&si->lock); - - DEBUG_TRACE("TCP flags: %#x are not fast. %u->%u skb=%px\n", - htonl(flags), htons(src_port), htons(dest_port), skb); - if (ret) { - sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); - } - rcu_read_unlock(); - - sfe_ipv6_exception_stats_inc(si, SFE_IPV6_EXCEPTION_EVENT_TCP_FLAGS); - return 0; - } - - counter_cm = cm->counter_match; - - /* - * Are we doing sequence number checking? - */ - if (likely(!(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK))) { - u32 seq; - u32 ack; - u32 sack; - u32 data_offs; - u32 end; - u32 left_edge; - u32 scaled_win; - u32 max_end; - - /* - * Is our sequence fully past the right hand edge of the window? - */ - seq = ntohl(tcph->seq); - if (unlikely((s32)(seq - (cm->protocol_state.tcp.max_end + 1)) > 0)) { - struct sfe_ipv6_connection *c = cm->connection; - spin_lock_bh(&si->lock); - ret = sfe_ipv6_remove_connection(si, c); - spin_unlock_bh(&si->lock); - - DEBUG_TRACE("seq: %u exceeds right edge: %u\n", - seq, cm->protocol_state.tcp.max_end + 1); - if (ret) { - sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); - } - rcu_read_unlock(); - - sfe_ipv6_exception_stats_inc(si, SFE_IPV6_EXCEPTION_EVENT_TCP_SEQ_EXCEEDS_RIGHT_EDGE); - return 0; - } - - /* - * Check that our TCP data offset isn't too short. 
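 *
 * (Worked example for the windowing tests in this block, illustrative
 * only: the idiom (s32)(a - b) > 0 is wraparound-safe sequence comparison.
 * With a = 0x00000010 and b = 0xfffffff0, a - b is 0x20 modulo 2^32, so
 * (s32)0x20 > 0 says a is "after" b even though a is numerically smaller.)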
- */ - data_offs = tcph->doff << 2; - if (unlikely(data_offs < sizeof(struct tcphdr))) { - struct sfe_ipv6_connection *c = cm->connection; - spin_lock_bh(&si->lock); - ret = sfe_ipv6_remove_connection(si, c); - spin_unlock_bh(&si->lock); - - DEBUG_TRACE("TCP data offset: %u, too small\n", data_offs); - if (ret) { - sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); - } - rcu_read_unlock(); - - sfe_ipv6_exception_stats_inc(si, SFE_IPV6_EXCEPTION_EVENT_TCP_SMALL_DATA_OFFS); - return 0; - } - - /* - * Update ACK according to any SACK option. - */ - ack = ntohl(tcph->ack_seq); - sack = ack; - if (unlikely(!sfe_ipv6_process_tcp_option_sack(tcph, data_offs, &sack))) { - struct sfe_ipv6_connection *c = cm->connection; - spin_lock_bh(&si->lock); - ret = sfe_ipv6_remove_connection(si, c); - spin_unlock_bh(&si->lock); - - DEBUG_TRACE("TCP option SACK size is wrong\n"); - if (ret) { - sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); - } - rcu_read_unlock(); - - sfe_ipv6_exception_stats_inc(si, SFE_IPV6_EXCEPTION_EVENT_TCP_BAD_SACK); - return 0; - } - - /* - * Check that our TCP data offset isn't past the end of the packet. - */ - data_offs += sizeof(struct ipv6hdr); - if (unlikely(len < data_offs)) { - struct sfe_ipv6_connection *c = cm->connection; - spin_lock_bh(&si->lock); - ret = sfe_ipv6_remove_connection(si, c); - spin_unlock_bh(&si->lock); - - DEBUG_TRACE("TCP data offset: %u, past end of packet: %u\n", - data_offs, len); - if (ret) { - sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); - } - rcu_read_unlock(); - - sfe_ipv6_exception_stats_inc(si, SFE_IPV6_EXCEPTION_EVENT_TCP_BIG_DATA_OFFS); - return 0; - } - - end = seq + len - data_offs; - - /* - * Is our sequence fully before the left hand edge of the window? - */ - if (unlikely((s32)(end - (cm->protocol_state.tcp.end - - counter_cm->protocol_state.tcp.max_win - 1)) < 0)) { - struct sfe_ipv6_connection *c = cm->connection; - spin_lock_bh(&si->lock); - ret = sfe_ipv6_remove_connection(si, c); - spin_unlock_bh(&si->lock); - - DEBUG_TRACE("seq: %u before left edge: %u\n", - end, cm->protocol_state.tcp.end - counter_cm->protocol_state.tcp.max_win - 1); - if (ret) { - sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); - } - rcu_read_unlock(); - - sfe_ipv6_exception_stats_inc(si, SFE_IPV6_EXCEPTION_EVENT_TCP_SEQ_BEFORE_LEFT_EDGE); - return 0; - } - - /* - * Are we acking data that is to the right of what has been sent? - */ - if (unlikely((s32)(sack - (counter_cm->protocol_state.tcp.end + 1)) > 0)) { - struct sfe_ipv6_connection *c = cm->connection; - spin_lock_bh(&si->lock); - ret = sfe_ipv6_remove_connection(si, c); - spin_unlock_bh(&si->lock); - - DEBUG_TRACE("ack: %u exceeds right edge: %u\n", - sack, counter_cm->protocol_state.tcp.end + 1); - if (ret) { - sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); - } - rcu_read_unlock(); - - sfe_ipv6_exception_stats_inc(si, SFE_IPV6_EXCEPTION_EVENT_TCP_ACK_EXCEEDS_RIGHT_EDGE); - return 0; - } - - /* - * Is our ack too far before the left hand edge of the window? 
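- *
- * All of these edge tests rely on wrap-safe serial arithmetic:
- * subtracting two u32 sequence numbers and casting the result to s32
- * orders them modulo 2^32. Illustration with assumed values either side
- * of a wrap:
- *
- *    u32 a = 0xfffffff0U, b = 0x10U;  // b is 0x20 ahead of a
- *    (s32)(b - a) > 0;                // true: b is after a
- *    (s32)(a - b) > 0;                // false: a is before b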
- */ - left_edge = counter_cm->protocol_state.tcp.end - - cm->protocol_state.tcp.max_win - - SFE_IPV6_TCP_MAX_ACK_WINDOW - - 1; - if (unlikely((s32)(sack - left_edge) < 0)) { - struct sfe_ipv6_connection *c = cm->connection; - spin_lock_bh(&si->lock); - ret = sfe_ipv6_remove_connection(si, c); - spin_unlock_bh(&si->lock); - - DEBUG_TRACE("ack: %u before left edge: %u\n", sack, left_edge); - if (ret) { - sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); - } - rcu_read_unlock(); - - sfe_ipv6_exception_stats_inc(si, SFE_IPV6_EXCEPTION_EVENT_TCP_ACK_BEFORE_LEFT_EDGE); - return 0; - } - - /* - * Have we just seen the largest window size yet for this connection? If yes - * then we need to record the new value. - */ - scaled_win = ntohs(tcph->window) << cm->protocol_state.tcp.win_scale; - scaled_win += (sack - ack); - if (unlikely(cm->protocol_state.tcp.max_win < scaled_win)) { - cm->protocol_state.tcp.max_win = scaled_win; - } - - /* - * If our sequence and/or ack numbers have advanced then record the new state. - */ - if (likely((s32)(end - cm->protocol_state.tcp.end) >= 0)) { - cm->protocol_state.tcp.end = end; - } - - max_end = sack + scaled_win; - if (likely((s32)(max_end - counter_cm->protocol_state.tcp.max_end) >= 0)) { - counter_cm->protocol_state.tcp.max_end = max_end; - } - } - - /* - * Check if skb was cloned. If it was, unshare it. Because - * the data area is going to be written in this path and we don't want to - * change the cloned skb's data section. - */ - if (unlikely(skb_cloned(skb))) { - DEBUG_TRACE("%px: skb is a cloned skb\n", skb); - skb = skb_unshare(skb, GFP_ATOMIC); - if (!skb) { - DEBUG_WARN("Failed to unshare the cloned skb\n"); - rcu_read_unlock(); - return 0; - } - - /* - * Update the iph and tcph pointers with the unshared skb's data area. 
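- *
- * This re-derivation matters because skb_unshare() may copy the packet
- * into a freshly allocated buffer, leaving any pointer computed from the
- * old skb->data dangling. The safe pattern, as used below (sketch):
- *
- *    skb = skb_unshare(skb, GFP_ATOMIC);    // may return a different skb
- *    iph = (struct ipv6hdr *)skb->data;     // recompute, never reuse old pointers
- *    tcph = (struct tcphdr *)(skb->data + ihl);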
- */ - iph = (struct ipv6hdr *)skb->data; - tcph = (struct tcphdr *)(skb->data + ihl); - } - - /* - * Check if skb has enough headroom to write L2 headers - */ - if (unlikely(skb_headroom(skb) < cm->l2_hdr_size)) { - rcu_read_unlock(); - DEBUG_WARN("%px: Not enough headroom: %u\n", skb, skb_headroom(skb)); - sfe_ipv6_exception_stats_inc(si, SFE_IPV6_EXCEPTION_EVENT_NO_HEADROOM); - return 0; - } - - /* - * For PPPoE packets, match server MAC and session id - */ - if (unlikely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_PPPOE_DECAP)) { - struct ethhdr *eth; - bool pppoe_match; - - if (unlikely(!sfe_l2_parse_flag_check(l2_info, SFE_L2_PARSE_FLAGS_PPPOE_INGRESS))) { - rcu_read_unlock(); - DEBUG_TRACE("%px: PPPoE header not present in packet for PPPoE rule\n", skb); - sfe_ipv6_exception_stats_inc(si, SFE_IPV6_EXCEPTION_EVENT_INCORRECT_PPPOE_PARSING); - return 0; - } - - eth = eth_hdr(skb); - - pppoe_match = (cm->pppoe_session_id == sfe_l2_pppoe_session_id_get(l2_info)) && - ether_addr_equal((u8*)cm->pppoe_remote_mac, (u8 *)eth->h_source); - - if (unlikely(!pppoe_match)) { - DEBUG_TRACE("%px: PPPoE sessions ID %d and %d or MAC %pM and %pM did not match\n", - skb, cm->pppoe_session_id, sfe_l2_pppoe_session_id_get(l2_info), - cm->pppoe_remote_mac, eth->h_source); - rcu_read_unlock(); - sfe_ipv6_exception_stats_inc(si, SFE_IPV6_EXCEPTION_EVENT_INVALID_PPPOE_SESSION); - return 0; - } - - skb->protocol = htons(l2_info->protocol); - this_cpu_inc(si->stats_pcpu->pppoe_decap_packets_forwarded64); - } else if (unlikely(sfe_l2_parse_flag_check(l2_info, SFE_L2_PARSE_FLAGS_PPPOE_INGRESS))) { - - /* - * If packet contains PPPoE header but CME doesn't contain PPPoE flag yet we are exceptioning the packet to linux - */ - if (unlikely(!bridge_flow)) { - rcu_read_unlock(); - DEBUG_TRACE("%px: CME doesn't contain PPPoE flag but packet has PPPoE header\n", skb); - sfe_ipv6_exception_stats_inc(si, SFE_IPV6_EXCEPTION_EVENT_PPPOE_NOT_SET_IN_CME); - return 0; - - } - - /* - * For bridged flows when packet contains PPPoE header, restore the header back and forward to xmit interface - */ - __skb_push(skb, PPPOE_SES_HLEN); - this_cpu_inc(si->stats_pcpu->pppoe_bridge_packets_forwarded64); - } - - /* - * From this point on we're good to modify the packet. - */ - - /* - * For PPPoE flows, add PPPoE header before L2 header is added. - */ - if (unlikely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_PPPOE_ENCAP)) { - sfe_pppoe_add_header(skb, cm->pppoe_session_id, PPP_IPV6); - this_cpu_inc(si->stats_pcpu->pppoe_encap_packets_forwarded64); - } - - /* - * Update DSCP - */ - if (unlikely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_DSCP_REMARK)) { - sfe_ipv6_change_dsfield(iph, cm->dscp); - } - - /* - * Decrement our hop_limit. - */ - if (likely(!bridge_flow)) { - iph->hop_limit -= 1; - } - - /* - * Enable HW csum if rx checksum is verified and xmit interface is CSUM offload capable. - * Note: If L4 csum at Rx was found to be incorrect, we (router) should use incremental L4 checksum here - * so that HW does not re-calculate/replace the L4 csum - */ - hw_csum = !!(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_CSUM_OFFLOAD) && (skb->ip_summed == CHECKSUM_UNNECESSARY); - - /* - * Do we have to perform translations of the source address/port? 
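- *
- * The translation blocks below patch the TCP checksum incrementally: a
- * precomputed one's-complement adjustment is added and the carry folded
- * back into the low 16 bits, avoiding a full recompute over the payload.
- * Worked example with assumed values:
- *
- *    u32 sum = 0xff00 + 0x0200;            // old csum + adjustment = 0x10100
- *    sum = (sum & 0xffff) + (sum >> 16);   // fold carry: 0x0100 + 1 = 0x0101
- *    // new checksum = 0x0101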
- */ - if (unlikely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_XLATE_SRC)) { - u16 tcp_csum; - u32 sum; - - iph->saddr.s6_addr32[0] = cm->xlate_src_ip[0].addr[0]; - iph->saddr.s6_addr32[1] = cm->xlate_src_ip[0].addr[1]; - iph->saddr.s6_addr32[2] = cm->xlate_src_ip[0].addr[2]; - iph->saddr.s6_addr32[3] = cm->xlate_src_ip[0].addr[3]; - tcph->source = cm->xlate_src_port; - - if (unlikely(!hw_csum)) { - tcp_csum = tcph->check; - sum = tcp_csum + cm->xlate_src_csum_adjustment; - sum = (sum & 0xffff) + (sum >> 16); - tcph->check = (u16)sum; - } - } - - /* - * Do we have to perform translations of the destination address/port? - */ - if (unlikely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_XLATE_DEST)) { - u16 tcp_csum; - u32 sum; - - iph->daddr.s6_addr32[0] = cm->xlate_dest_ip[0].addr[0]; - iph->daddr.s6_addr32[1] = cm->xlate_dest_ip[0].addr[1]; - iph->daddr.s6_addr32[2] = cm->xlate_dest_ip[0].addr[2]; - iph->daddr.s6_addr32[3] = cm->xlate_dest_ip[0].addr[3]; - tcph->dest = cm->xlate_dest_port; - - if (unlikely(!hw_csum)) { - tcp_csum = tcph->check; - sum = tcp_csum + cm->xlate_dest_csum_adjustment; - sum = (sum & 0xffff) + (sum >> 16); - tcph->check = (u16)sum; - } - } - - /* - * If HW checksum offload is not possible, incremental L4 checksum is used to update the packet. - * Setting ip_summed to CHECKSUM_UNNECESSARY ensures checksum is not recalculated further in packet - * path. - */ - if (likely(hw_csum)) { - skb->ip_summed = CHECKSUM_PARTIAL; - } else { - skb->ip_summed = CHECKSUM_UNNECESSARY; - } - - /* - * Update traffic stats. - */ - atomic_inc(&cm->rx_packet_count); - atomic_add(len, &cm->rx_byte_count); - - xmit_dev = cm->xmit_dev; - skb->dev = xmit_dev; - - /* - * Check to see if we need to add VLAN tags - */ - if (unlikely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_INSERT_EGRESS_VLAN_TAG)) { - sfe_vlan_add_tag(skb, cm->egress_vlan_hdr_cnt, cm->egress_vlan_hdr); - } - - /* - * Check to see if we need to write an Ethernet header. - */ - if (likely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_WRITE_L2_HDR)) { - if (unlikely(!(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_WRITE_FAST_ETH_HDR))) { - dev_hard_header(skb, xmit_dev, ntohs(skb->protocol), - cm->xmit_dest_mac, cm->xmit_src_mac, len); - } else { - /* - * For the simple case we write this really fast. - */ - struct ethhdr *eth = (struct ethhdr *)__skb_push(skb, ETH_HLEN); - - eth->h_proto = skb->protocol; - ether_addr_copy((u8 *)eth->h_dest, (u8 *)cm->xmit_dest_mac); - ether_addr_copy((u8 *)eth->h_source, (u8 *)cm->xmit_src_mac); - } - } - - /* - * Update priority of skb. - */ - if (unlikely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_PRIORITY_REMARK)) { - skb->priority = cm->priority; - } - - /* - * Mark outgoing packet - */ - if (unlikely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_MARK)) { - skb->mark = cm->mark; - /* - * Update service class stats if SAWF is valid. - */ - if (likely(cm->sawf_valid)) { - service_class_id = SFE_GET_SAWF_SERVICE_CLASS(cm->mark); - sfe_ipv6_service_class_stats_inc(si, service_class_id, len); - } - } - - /* - * For the first packets, check if it could got fast xmit. 
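- *
- * Three match flags cooperate in the block below (a reading aid, with
- * semantics inferred from their use here; prefix
- * SFE_IPV6_CONNECTION_MATCH_FLAG_ omitted):
- *    FAST_XMIT_DEV_ADMISSION - the egress device is eligible for fast xmit
- *    FAST_XMIT_FLOW_CHECKED  - latched once the first packet has been
- *                              probed, so netif_skb_features() runs only
- *                              once per flow
- *    FAST_XMIT               - the cached verdict applied to every
- *                              subsequent packet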
- */ - if (unlikely(!(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_FAST_XMIT_FLOW_CHECKED) - && (cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_FAST_XMIT_DEV_ADMISSION))){ - cm->features = netif_skb_features(skb); - if (likely(sfe_fast_xmit_check(skb, cm->features))) { - cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_FAST_XMIT; - } - cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_FAST_XMIT_FLOW_CHECKED; - } - features = cm->features; - - fast_xmit = !!(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_FAST_XMIT); - - rcu_read_unlock(); - - this_cpu_inc(si->stats_pcpu->packets_forwarded64); - - /* - * We're going to check for GSO flags when we transmit the packet so - * start fetching the necessary cache line now. - */ - prefetch(skb_shinfo(skb)); - - /* - * We do per packet condition check before we could fast xmit the - * packet. - */ - if (likely(fast_xmit)) { - if (likely(!skb_is_gso(skb))) { - if (likely(dev_fast_xmit(skb, xmit_dev, features))) { - this_cpu_inc(si->stats_pcpu->packets_fast_xmited64); - return 1; - } - } else { - cm->flags &= ~SFE_IPV6_CONNECTION_MATCH_FLAG_FAST_XMIT; - DEBUG_TRACE("%px: fast xmit disabled for xmit dev %s", skb, xmit_dev->name); - } - } - - /* - * Mark that this packet has been fast forwarded. - */ - skb->fast_forwarded = 1; - - /* - * Send the packet on its way. - */ - dev_queue_xmit(skb); - - return 1; -} diff --git a/shortcut-fe/sfe_ipv6_tcp.h b/shortcut-fe/sfe_ipv6_tcp.h deleted file mode 100644 index fa1a20e46..000000000 --- a/shortcut-fe/sfe_ipv6_tcp.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * sfe_ipv6_tcp.h - * Shortcut forwarding engine header file for IPv6 TCP - * - * Copyright (c) 2015-2016, 2019-2020, The Linux Foundation. All rights reserved. - * Copyright (c) 2021-2022 Qualcomm Innovation Center, Inc. All rights reserved. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ - -int sfe_ipv6_recv_tcp(struct sfe_ipv6 *si, struct sk_buff *skb, struct net_device *dev, - unsigned int len, struct ipv6hdr *iph, unsigned int ihl, - bool sync_on_find,struct sfe_l2_info *l2_info); diff --git a/shortcut-fe/sfe_ipv6_tunipip6.c b/shortcut-fe/sfe_ipv6_tunipip6.c deleted file mode 100644 index 8690f2642..000000000 --- a/shortcut-fe/sfe_ipv6_tunipip6.c +++ /dev/null @@ -1,260 +0,0 @@ -/* - * sfe_ipv6_tunipip6.c - * Shortcut forwarding engine file for IPv6 TUNIPIP6 - * - * Copyright (c) 2022 Qualcomm Innovation Center, Inc. All rights reserved. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ - -#include -#include -#include -#include -#include - -#include "sfe_debug.h" -#include "sfe_api.h" -#include "sfe.h" -#include "sfe_flow_cookie.h" -#include "sfe_ipv6.h" -#include "sfe_vlan.h" - -/* - * sfe_ipv6_recv_tunipip6() - * Handle TUNIPIP6 packet receives and forwarding. - */ -int sfe_ipv6_recv_tunipip6(struct sfe_ipv6 *si, struct sk_buff *skb, struct net_device *dev, - unsigned int len, struct ipv6hdr *iph, unsigned int ihl, - bool sync_on_find, struct sfe_l2_info *l2_info, bool tun_outer) -{ - struct sfe_ipv6_addr *src_ip; - struct sfe_ipv6_addr *dest_ip; - __be16 src_port = 0; - __be16 dest_port = 0; - unsigned int ihl_tmp = sizeof(struct ipv6hdr); - struct sfe_ipv6_connection_match *cm; - bool non_dst = false; - u8 next_hdr; - - DEBUG_TRACE("%px: sfe: sfe_ipv6_recv_tunipip6 called.\n", skb); - - /* - * Read the IP address information. Read the IP header data first - * because we've almost certainly got that in the cache. - */ - src_ip = (struct sfe_ipv6_addr *)iph->saddr.s6_addr32; - dest_ip = (struct sfe_ipv6_addr *)iph->daddr.s6_addr32; - - rcu_read_lock(); - - /* - * Look for a connection match. - */ -#ifdef CONFIG_NF_FLOW_COOKIE - cm = si->sfe_flow_cookie_table[skb->flow_cookie & SFE_FLOW_COOKIE_MASK].match; - if (unlikely(!cm)) { - cm = sfe_ipv6_find_connection_match_rcu(si, dev, IPPROTO_IPIP, src_ip, src_port, dest_ip, dest_port); - } -#else - cm = sfe_ipv6_find_connection_match_rcu(si, dev, IPPROTO_IPIP, src_ip, src_port, dest_ip, dest_port); -#endif - if (unlikely(!cm)) { - rcu_read_unlock(); - sfe_ipv6_exception_stats_inc(si, SFE_IPV6_EXCEPTION_EVENT_TUNIPIP6_NO_CONNECTION); - DEBUG_TRACE("%px: no connection found\n", skb); - return 0; - } - - next_hdr = iph->nexthdr; - - /* - * Try to find an extension header(if any) that is not NEXTHDR_DEST. - */ - while (unlikely(sfe_ipv6_is_ext_hdr(next_hdr))) { - struct sfe_ipv6_ext_hdr *ext_hdr; - unsigned int ext_hdr_len; - - if(next_hdr != NEXTHDR_DEST) { - non_dst = true; - break; - } - - ext_hdr = (struct sfe_ipv6_ext_hdr *)(skb->data + ihl_tmp); - - ext_hdr_len = ext_hdr->hdr_len; - ext_hdr_len <<= 3; - ext_hdr_len += sizeof(struct sfe_ipv6_ext_hdr); - ihl_tmp += ext_hdr_len; - - next_hdr = ext_hdr->next_hdr; - } - - /* - * If our packet has been marked as "sync on find" we will sync the status - * and forward it to slowpath, except that encap_limit is set for dslite tunnel - * which is embedded in exthdr type NEXTHDR_DEST. - */ - if (unlikely(sync_on_find && non_dst)) { - sfe_ipv6_sync_status(si, cm->connection, SFE_SYNC_REASON_STATS); - rcu_read_unlock(); - sfe_ipv6_exception_stats_inc(si, SFE_IPV6_EXCEPTION_EVENT_TUNIPIP6_SYNC_ON_FIND); - DEBUG_TRACE("%px: Sync on find\n", skb); - - return 0; - } - - /* - * If cm->proto is set, it means the decap path. - * Otherwise we forward the packet in encap path. - */ - if(cm->proto) { -#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 13, 0)) - const struct inet6_protocol *ipprot = cm->proto; -#else - struct inet6_protocol *ipprot = cm->proto; -#endif - - /* - * Do we expect an ingress VLAN tag for this flow? - * Note: We will only have ingress tag check in decap direction. 
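- *
- * Stepping back to the extension-header walk above: every IPv6 extension
- * header begins with (next_hdr, hdr_len) and spans (hdr_len << 3) + 8
- * bytes, which is what the sfe_ipv6_ext_hdr arithmetic computes.
- * Assuming a layout along these lines:
- *
- *    struct sfe_ipv6_ext_hdr {
- *        __u8 next_hdr;
- *        __u8 hdr_len;   // length in 8-byte units, excluding the first 8
- *        __u8 pad[6];
- *    };
- *    // hdr_len = 1  ->  (1 << 3) + 8 = 16 bytes for this header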
- */ - if (unlikely(!sfe_vlan_validate_ingress_tag(skb, cm->ingress_vlan_hdr_cnt, cm->ingress_vlan_hdr, l2_info))) { - rcu_read_unlock(); - sfe_ipv6_exception_stats_inc(si, SFE_IPV6_EXCEPTION_EVENT_INGRESS_VLAN_TAG_MISMATCH); - DEBUG_TRACE("VLAN tag mismatch. skb=%px\n" - "cm: %u [0]=%x/%x [1]=%x/%x\n" - "l2_info+: %u [0]=%x/%x [1]=%x/%x\n", skb, - cm->ingress_vlan_hdr_cnt, - htons(cm->ingress_vlan_hdr[0].tpid), cm->ingress_vlan_hdr[0].tci, - htons(cm->ingress_vlan_hdr[1].tpid), cm->ingress_vlan_hdr[1].tci, - l2_info->vlan_hdr_cnt, - htons(l2_info->vlan_hdr[0].tpid), l2_info->vlan_hdr[0].tci, - htons(l2_info->vlan_hdr[1].tpid), l2_info->vlan_hdr[1].tci); - return 0; - } - skb_reset_network_header(skb); - skb_pull(skb, ihl); - skb_reset_transport_header(skb); - - /* - * ipprot->handler(skb) will always return 0; - * There is no way to tell whether the packet is dropped later in linux or not. - * Hence here inc the byte/packet count always. - */ - atomic_inc(&cm->rx_packet_count); - atomic_add(len, &cm->rx_byte_count); - this_cpu_inc(si->stats_pcpu->packets_forwarded64); - rcu_read_unlock(); - DEBUG_TRACE("%px: %s decap done \n",skb, __func__); - - /* - * Update top interface for tunnel searching. - */ - skb->dev = cm->top_interface_dev; - ipprot->handler(skb); - return 1; - } - - /* - * Check if skb has enough headroom to write L2 headers - */ - if (unlikely(skb_headroom(skb) < cm->l2_hdr_size)) { - rcu_read_unlock(); - DEBUG_WARN("%px: Not enough headroom: %u\n", skb, skb_headroom(skb)); - sfe_ipv6_exception_stats_inc(si, SFE_IPV6_EXCEPTION_EVENT_NO_HEADROOM); - return 0; - } - - /* - * If our packet is larger than the MTU of the transmit interface then - * we can't forward it easily. - */ - if (unlikely(len > cm->xmit_dev_mtu)) { - sfe_ipv6_sync_status(si, cm->connection, SFE_SYNC_REASON_STATS); - rcu_read_unlock(); - - sfe_ipv6_exception_stats_inc(si, SFE_IPV6_EXCEPTION_EVENT_TUNIPIP6_NEEDS_FRAGMENTATION); - DEBUG_TRACE("%px: Larger than mtu\n", skb); - return 0; - } - - /* - * Update DSCP - */ - if (unlikely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_DSCP_REMARK)) { - sfe_ipv6_change_dsfield(iph, cm->dscp); - } - - /* - * Update traffic stats. - */ - atomic_inc(&cm->rx_packet_count); - atomic_add(len, &cm->rx_byte_count); - - skb->dev = cm->xmit_dev; - - /* - * Check to see if we need to add VLAN tags - */ - if (unlikely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_INSERT_EGRESS_VLAN_TAG)) { - sfe_vlan_add_tag(skb, cm->egress_vlan_hdr_cnt, cm->egress_vlan_hdr); - } - - /* - * Check to see if we need to write a header. - */ - if (likely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_WRITE_L2_HDR)) { - if (unlikely(!(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_WRITE_FAST_ETH_HDR))) { - dev_hard_header(skb, cm->xmit_dev, ntohs(skb->protocol), - cm->xmit_dest_mac, cm->xmit_src_mac, len); - } else { - struct ethhdr *eth = (struct ethhdr *)__skb_push(skb, ETH_HLEN); - eth->h_proto = skb->protocol; - ether_addr_copy((u8 *)eth->h_dest, (u8 *)cm->xmit_dest_mac); - ether_addr_copy((u8 *)eth->h_source, (u8 *)cm->xmit_src_mac); - } - } - - /* - * Update priority of skb. - */ - if (unlikely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_PRIORITY_REMARK)) { - skb->priority = cm->priority; - } - - /* - * Mark outgoing packet. 
- */ - if (unlikely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_MARK)) { - skb->mark = cm->mark; - } - - rcu_read_unlock(); - - this_cpu_inc(si->stats_pcpu->packets_forwarded64); - - /* - * We're going to check for GSO flags when we transmit the packet so - * start fetching the necessary cache line now. - */ - prefetch(skb_shinfo(skb)); - - /* - * Mark that this packet has been fast forwarded and send it on its way. - */ - skb->fast_forwarded = 1; - dev_queue_xmit(skb); - - return 1; -} diff --git a/shortcut-fe/sfe_ipv6_tunipip6.h b/shortcut-fe/sfe_ipv6_tunipip6.h deleted file mode 100644 index d6f4f9be7..000000000 --- a/shortcut-fe/sfe_ipv6_tunipip6.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * sfe_ipv6_tunipip6.h - * Shortcut forwarding engine header file for IPv6 TUNIPIP6 - * - * Copyright (c) 2022 Qualcomm Innovation Center, Inc. All rights reserved. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ - -int sfe_ipv6_recv_tunipip6(struct sfe_ipv6 *si, struct sk_buff *skb, struct net_device *dev, - unsigned int len, struct ipv6hdr *iph, unsigned int ihl, - bool sync_on_find, struct sfe_l2_info *l2_info, bool tun_outer); diff --git a/shortcut-fe/sfe_ipv6_udp.c b/shortcut-fe/sfe_ipv6_udp.c deleted file mode 100644 index c80f38b28..000000000 --- a/shortcut-fe/sfe_ipv6_udp.c +++ /dev/null @@ -1,589 +0,0 @@ -/* - * sfe_ipv6_udp.c - * Shortcut forwarding engine file for IPv6 UDP - * - * Copyright (c) 2015-2016, 2019-2020, The Linux Foundation. All rights reserved. - * Copyright (c) 2021-2022 Qualcomm Innovation Center, Inc. All rights reserved. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ - -#include -#include -#include -#include -#include - -#include "sfe_debug.h" -#include "sfe_api.h" -#include "sfe.h" -#include "sfe_flow_cookie.h" -#include "sfe_ipv6.h" -#include "sfe_pppoe.h" -#include "sfe_vlan.h" - -/* - * sfe_ipv6_udp_sk_deliver() - * Deliver the packet to the protocol handler registered with Linux. - * To be called under rcu_read_lock() - * Returns: - * 1 if the packet needs to be passed to Linux. - * 0 if the packet is processed successfully. - * -1 if the packet is dropped in SFE. 
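- *
- * Callers dispatch on this tri-state result; a sketch matching the use
- * later in sfe_ipv6_recv_udp():
- *
- *    ret = sfe_ipv6_udp_sk_deliver(skb, cm, ihl);
- *    if (ret == -1)
- *        return 1;    // dropped and freed inside SFE: skb is gone
- *    if (ret == 1)
- *        return 0;    // no encap handler: let Linux process the packet
- *    // ret == 0: delivered to the tunnel's encap_rcv; update stats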
- */ -static int sfe_ipv6_udp_sk_deliver(struct sk_buff *skb, struct sfe_ipv6_connection_match *cm, - unsigned int ihl) -{ - int (*encap_rcv)(struct sock *sk, struct sk_buff *skb); - struct udp_sock *up; - struct udphdr *udph; - struct sock *sk; - int ret; - - /* - * Call the decap handler - */ - up = rcu_dereference(cm->up); - encap_rcv = READ_ONCE(up->encap_rcv); - if (unlikely(!encap_rcv)) { - DEBUG_ERROR("sfe: Error: up->encap_rcv is NULL\n"); - return 1; - } - -#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 13, 0)) - nf_reset(skb); -#else - nf_reset_ct(skb); -#endif - skb_pull(skb, ihl); - skb_reset_transport_header(skb); - - udph = udp_hdr(skb); - if (unlikely(skb->ip_summed != CHECKSUM_UNNECESSARY) && unlikely(skb->ip_summed != CHECKSUM_COMPLETE)) { - /* - * Set Pseudo Checksum using Linux API - */ - if (unlikely(udp6_csum_init(skb, udp_hdr(skb), IPPROTO_UDP))) { - DEBUG_ERROR("sfe: udp checksum init() failed: %p\n", skb); - kfree_skb(skb); - return -1; - } - - /* - * Verify checksum before giving to encap_rcv handler function. - */ - if (unlikely(udp_lib_checksum_complete(skb))) { - DEBUG_ERROR("sfe: Invalid udp checksum: %p\n", skb); - kfree_skb(skb); - return -1; - } - } - - /* - * Mark that this packet has been fast forwarded. - */ - sk = (struct sock *)up; - - /* - * TODO: Find the fix to set skb->ip_summed = CHECKSUM_NONE; - */ - - /* - * encap_rcv() returns the following value: - * =0 if skb was successfully passed to the encap - * handler or was discarded by it. - * >0 if skb should be passed on to UDP. - * <0 if skb should be resubmitted as proto -N - */ - ret = encap_rcv(sk, skb); - if (unlikely(ret)) { - - /* - * If encap_rcv fails, vxlan driver drops the packet. - * No need to free the skb here. - */ - DEBUG_ERROR("sfe: udp-decap API return error: %d\n", ret); - return -1; - } - - DEBUG_TRACE("sfe: udp-decap API encap_rcv successful\n"); - return 0; -} - -/* - * sfe_ipv6_recv_udp() - * Handle UDP packet receives and forwarding. - */ -int sfe_ipv6_recv_udp(struct sfe_ipv6 *si, struct sk_buff *skb, struct net_device *dev, - unsigned int len, struct ipv6hdr *iph, unsigned int ihl, bool sync_on_find, struct sfe_l2_info *l2_info, bool tun_outer) -{ - struct udphdr *udph; - struct sfe_ipv6_addr *src_ip; - struct sfe_ipv6_addr *dest_ip; - __be16 src_port; - __be16 dest_port; - u32 service_class_id; - struct sfe_ipv6_connection_match *cm; - struct net_device *xmit_dev; - int ret; - bool hw_csum; - bool bridge_flow; - bool fast_xmit; - netdev_features_t features; - - DEBUG_TRACE("%px: sfe: sfe_ipv6_recv_udp called.\n", skb); - - /* - * Is our packet too short to contain a valid UDP header? - */ - if (!pskb_may_pull(skb, (sizeof(struct udphdr) + ihl))) { - - sfe_ipv6_exception_stats_inc(si,SFE_IPV6_EXCEPTION_EVENT_UDP_HEADER_INCOMPLETE); - DEBUG_TRACE("packet too short for UDP header\n"); - return 0; - } - - /* - * Read the IP address and port information. Read the IP header data first - * because we've almost certainly got that in the cache. We may not yet have - * the UDP header cached though so allow more time for any prefetching. - */ - src_ip = (struct sfe_ipv6_addr *)iph->saddr.s6_addr32; - dest_ip = (struct sfe_ipv6_addr *)iph->daddr.s6_addr32; - - udph = (struct udphdr *)(skb->data + ihl); - src_port = udph->source; - dest_port = udph->dest; - - rcu_read_lock(); - - /* - * Look for a connection match. 
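- *
- * The lookup below tries the exact 5-tuple first and, on a miss, retries
- * with src_port = 0, which this engine treats as a wildcard so tunnels
- * such as VxLAN (ephemeral source ports) can still match a single rule:
- *
- *    cm = sfe_ipv6_find_connection_match_rcu(si, dev, IPPROTO_UDP, src_ip, src_port, dest_ip, dest_port);
- *    if (!cm)
- *        cm = sfe_ipv6_find_connection_match_rcu(si, dev, IPPROTO_UDP, src_ip, 0, dest_ip, dest_port);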
 */
-#ifdef CONFIG_NF_FLOW_COOKIE
- cm = si->sfe_flow_cookie_table[skb->flow_cookie & SFE_FLOW_COOKIE_MASK].match;
- if (unlikely(!cm)) {
- cm = sfe_ipv6_find_connection_match_rcu(si, dev, IPPROTO_UDP, src_ip, src_port, dest_ip, dest_port);
- }
-#else
- cm = sfe_ipv6_find_connection_match_rcu(si, dev, IPPROTO_UDP, src_ip, src_port, dest_ip, dest_port);
-#endif
- if (unlikely(!cm)) {
-
- /*
- * Try a 4-tuple lookup; required for tunnels like VxLAN.
- */
- cm = sfe_ipv6_find_connection_match_rcu(si, dev, IPPROTO_UDP, src_ip, 0, dest_ip, dest_port);
- if (unlikely(!cm)) {
- rcu_read_unlock();
- sfe_ipv6_exception_stats_inc(si, SFE_IPV6_EXCEPTION_EVENT_UDP_NO_CONNECTION);
- DEBUG_TRACE("no connection found\n");
- return 0;
- }
- DEBUG_TRACE("sfe: 4-tuple lookup successful\n");
- }
-
- /*
- * Do we expect an ingress VLAN tag for this flow?
- */
- if (unlikely(!sfe_vlan_validate_ingress_tag(skb, cm->ingress_vlan_hdr_cnt, cm->ingress_vlan_hdr, l2_info))) {
- rcu_read_unlock();
- sfe_ipv6_exception_stats_inc(si, SFE_IPV6_EXCEPTION_EVENT_INGRESS_VLAN_TAG_MISMATCH);
- DEBUG_TRACE("VLAN tag mismatch. skb=%px\n", skb);
- return 0;
- }
-
- /*
- * Validate the source interface.
- */
- if (unlikely((cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_SRC_INTERFACE_CHECK) && (cm->match_dev != dev))) {
- if (!(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_SRC_INTERFACE_CHECK_NO_FLUSH)) {
- struct sfe_ipv6_connection *c = cm->connection;
- DEBUG_TRACE("flush on source interface check failure\n");
- spin_lock_bh(&si->lock);
- ret = sfe_ipv6_remove_connection(si, c);
- spin_unlock_bh(&si->lock);
-
- if (ret) {
- sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH);
- }
- }
- rcu_read_unlock();
- sfe_ipv6_exception_stats_inc(si, SFE_IPV6_EXCEPTION_EVENT_INVALID_SRC_IFACE);
- DEBUG_TRACE("exception the packet on source interface check failure\n");
- return 0;
- }
-
- /*
- * If our packet has been marked as "sync on find" we can't actually
- * forward it in the fast path, but now that we've found an associated
- * connection we need to sync its status before handing it to the slow path.
- */
- if (unlikely(sync_on_find)) {
- sfe_ipv6_sync_status(si, cm->connection, SFE_SYNC_REASON_STATS);
- rcu_read_unlock();
-
- sfe_ipv6_exception_stats_inc(si, SFE_IPV6_EXCEPTION_EVENT_UDP_IP_OPTIONS_OR_INITIAL_FRAGMENT);
- DEBUG_TRACE("Sync on find\n");
- return 0;
- }
-
-#ifdef CONFIG_XFRM
- /*
- * We can't accelerate the flow on this direction, just let it go
- * through the slow path.
- */
- if (unlikely(!cm->flow_accel)) {
- rcu_read_unlock();
- this_cpu_inc(si->stats_pcpu->packets_not_forwarded64);
- return 0;
- }
-#endif
-
- bridge_flow = !!(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_BRIDGE_FLOW);
-
- /*
- * Does our hop_limit allow forwarding?
- */
- if (likely(!bridge_flow)) {
- if (unlikely(iph->hop_limit < 2)) {
- sfe_ipv6_sync_status(si, cm->connection, SFE_SYNC_REASON_STATS);
- rcu_read_unlock();
-
- sfe_ipv6_exception_stats_inc(si, SFE_IPV6_EXCEPTION_EVENT_UDP_SMALL_TTL);
- DEBUG_TRACE("hop_limit too low\n");
- return 0;
- }
- }
-
- /*
- * If our packet is larger than the MTU of the transmit interface then
- * we can't forward it easily.
- */
- if (unlikely((len > cm->xmit_dev_mtu) && (!cm->up))) {
- sfe_ipv6_sync_status(si, cm->connection, SFE_SYNC_REASON_STATS);
- rcu_read_unlock();
-
- sfe_ipv6_exception_stats_inc(si, SFE_IPV6_EXCEPTION_EVENT_UDP_NEEDS_FRAGMENTATION);
- DEBUG_TRACE("Larger than MTU\n");
- return 0;
- }
-
- /*
- * Check if skb was cloned. If it was, unshare it.
Because - * the data area is going to be written in this path and we don't want to - * change the cloned skb's data section. - */ - if (unlikely(skb_cloned(skb))) { - DEBUG_TRACE("%px: skb is a cloned skb\n", skb); - skb = skb_unshare(skb, GFP_ATOMIC); - if (!skb) { - DEBUG_WARN("Failed to unshare the cloned skb\n"); - rcu_read_unlock(); - return 0; - } - - /* - * Update the iph and udph pointers with the unshared skb's data area. - */ - iph = (struct ipv6hdr *)skb->data; - udph = (struct udphdr *)(skb->data + ihl); - } - - /* - * Check if skb has enough headroom to write L2 headers - */ - if (unlikely(skb_headroom(skb) < cm->l2_hdr_size)) { - rcu_read_unlock(); - DEBUG_WARN("%px: Not enough headroom: %u\n", skb, skb_headroom(skb)); - sfe_ipv6_exception_stats_inc(si, SFE_IPV6_EXCEPTION_EVENT_NO_HEADROOM); - return 0; - } - - /* - * For PPPoE packets, match server MAC and session id - */ - if (unlikely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_PPPOE_DECAP)) { - struct ethhdr *eth; - bool pppoe_match; - - if (unlikely(!sfe_l2_parse_flag_check(l2_info, SFE_L2_PARSE_FLAGS_PPPOE_INGRESS))) { - rcu_read_unlock(); - DEBUG_TRACE("%px: PPPoE header not present in packet for PPPoE rule\n", skb); - sfe_ipv6_exception_stats_inc(si, SFE_IPV6_EXCEPTION_EVENT_INCORRECT_PPPOE_PARSING); - return 0; - } - - eth = eth_hdr(skb); - - pppoe_match = (cm->pppoe_session_id == sfe_l2_pppoe_session_id_get(l2_info)) && - ether_addr_equal((u8*)cm->pppoe_remote_mac, (u8 *)eth->h_source); - - if (unlikely(!pppoe_match)) { - DEBUG_TRACE("%px: PPPoE sessions ID %d and %d or MAC %pM and %pM did not match\n", - skb, cm->pppoe_session_id, sfe_l2_pppoe_session_id_get(l2_info), - cm->pppoe_remote_mac, eth->h_source); - rcu_read_unlock(); - sfe_ipv6_exception_stats_inc(si, SFE_IPV6_EXCEPTION_EVENT_INVALID_PPPOE_SESSION); - return 0; - } - - skb->protocol = htons(l2_info->protocol); - this_cpu_inc(si->stats_pcpu->pppoe_decap_packets_forwarded64); - } else if (unlikely(sfe_l2_parse_flag_check(l2_info, SFE_L2_PARSE_FLAGS_PPPOE_INGRESS))) { - - /* - * If packet contains PPPoE header but CME doesn't contain PPPoE flag yet we are exceptioning the packet to linux - */ - if (unlikely(!bridge_flow)) { - rcu_read_unlock(); - DEBUG_TRACE("%px: CME doesn't contain PPPoE flag but packet has PPPoE header\n", skb); - sfe_ipv6_exception_stats_inc(si, SFE_IPV6_EXCEPTION_EVENT_PPPOE_NOT_SET_IN_CME); - return 0; - } - - /* - * For bridged flows when packet contains PPPoE header, restore the header back and forward to xmit interface - */ - __skb_push(skb, PPPOE_SES_HLEN); - this_cpu_inc(si->stats_pcpu->pppoe_bridge_packets_forwarded64); - } - - /* - * From this point on we're good to modify the packet. - */ - - /* - * For PPPoE flows, add PPPoE header before L2 header is added. - */ - if (unlikely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_PPPOE_ENCAP)) { - sfe_pppoe_add_header(skb, cm->pppoe_session_id, PPP_IPV6); - this_cpu_inc(si->stats_pcpu->pppoe_encap_packets_forwarded64); - } - - /* - * UDP sock will be valid only in decap-path. - * Call encap_rcv function associated with udp_sock in cm. - */ - if (unlikely(cm->up)) { - - /* - * Call decap handler associated with sock. - * Also validates UDP checksum before calling decap handler. 
 */
- ret = sfe_ipv6_udp_sk_deliver(skb, cm, ihl);
- if (unlikely(ret == -1)) {
- rcu_read_unlock();
- this_cpu_inc(si->stats_pcpu->packets_dropped64);
- return 1;
- } else if (unlikely(ret == 1)) {
- rcu_read_unlock();
- this_cpu_inc(si->stats_pcpu->packets_not_forwarded64);
- return 0;
- }
-
- /*
- * Update traffic stats
- */
- atomic_inc(&cm->rx_packet_count);
- atomic_add(len, &cm->rx_byte_count);
-
- rcu_read_unlock();
- this_cpu_inc(si->stats_pcpu->packets_forwarded64);
- DEBUG_TRACE("%px: sfe: sfe_ipv6_recv_udp -> encap_rcv done.\n", skb);
- return 1;
- }
-
- /*
- * Update DSCP
- */
- if (unlikely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_DSCP_REMARK)) {
- sfe_ipv6_change_dsfield(iph, cm->dscp);
- }
-
- /*
- * Decrement our hop_limit.
- */
- if (likely(!bridge_flow)) {
- iph->hop_limit -= (u8)!tun_outer;
- }
-
- /*
- * Enable HW csum if rx checksum is verified and xmit interface is CSUM offload capable.
- * Note: If L4 csum at Rx was found to be incorrect, we (router) should use incremental L4 checksum here
- * so that HW does not re-calculate/replace the L4 csum
- */
- hw_csum = !!(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_CSUM_OFFLOAD) && (skb->ip_summed == CHECKSUM_UNNECESSARY);
-
- /*
- * Do we have to perform translations of the source address/port?
- */
- if (unlikely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_XLATE_SRC)) {
- u16 udp_csum;
-
- iph->saddr.s6_addr32[0] = cm->xlate_src_ip[0].addr[0];
- iph->saddr.s6_addr32[1] = cm->xlate_src_ip[0].addr[1];
- iph->saddr.s6_addr32[2] = cm->xlate_src_ip[0].addr[2];
- iph->saddr.s6_addr32[3] = cm->xlate_src_ip[0].addr[3];
- udph->source = cm->xlate_src_port;
-
- /*
- * Do we have a non-zero UDP checksum? If we do then we need
- * to update it.
- */
- if (unlikely(!hw_csum)) {
- udp_csum = udph->check;
- if (likely(udp_csum)) {
- u32 sum = udp_csum + cm->xlate_src_csum_adjustment;
- sum = (sum & 0xffff) + (sum >> 16);
- udph->check = (u16)sum;
- }
- }
- }
-
- /*
- * Do we have to perform translations of the destination address/port?
- */
- if (unlikely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_XLATE_DEST)) {
- u16 udp_csum;
-
- iph->daddr.s6_addr32[0] = cm->xlate_dest_ip[0].addr[0];
- iph->daddr.s6_addr32[1] = cm->xlate_dest_ip[0].addr[1];
- iph->daddr.s6_addr32[2] = cm->xlate_dest_ip[0].addr[2];
- iph->daddr.s6_addr32[3] = cm->xlate_dest_ip[0].addr[3];
- udph->dest = cm->xlate_dest_port;
-
- /*
- * Do we have a non-zero UDP checksum? If we do then we need
- * to update it.
- */
- if (unlikely(!hw_csum)) {
- udp_csum = udph->check;
- if (likely(udp_csum)) {
- u32 sum = udp_csum + cm->xlate_dest_csum_adjustment;
- sum = (sum & 0xffff) + (sum >> 16);
- udph->check = (u16)sum;
- }
- }
- }
-
- /*
- * If HW checksum offload is not possible, incremental L4 checksum is used to update the packet.
- * Setting ip_summed to CHECKSUM_UNNECESSARY ensures checksum is not recalculated further in packet
- * path.
- */
- if (likely(hw_csum)) {
- skb->ip_summed = CHECKSUM_PARTIAL;
- }
-
- /*
- * Update traffic stats.
- */
- atomic_inc(&cm->rx_packet_count);
- atomic_add(len, &cm->rx_byte_count);
-
- xmit_dev = cm->xmit_dev;
- skb->dev = xmit_dev;
-
- /*
- * Check to see if we need to add VLAN tags
- */
- if (unlikely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_INSERT_EGRESS_VLAN_TAG)) {
- sfe_vlan_add_tag(skb, cm->egress_vlan_hdr_cnt, cm->egress_vlan_hdr);
- }
-
- /*
- * Check to see if we need to write an Ethernet header.
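- *
- * Two paths follow: dev_hard_header() for devices that supply their own
- * header_ops, and a direct 14-byte write when WRITE_FAST_ETH_HDR says a
- * plain Ethernet header with precomputed MACs is enough (sketch):
- *
- *    struct ethhdr *eth = (struct ethhdr *)__skb_push(skb, ETH_HLEN);
- *    eth->h_proto = skb->protocol;    // already network byte order
- *    ether_addr_copy(eth->h_dest, cm->xmit_dest_mac);
- *    ether_addr_copy(eth->h_source, cm->xmit_src_mac);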
- */ - if (likely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_WRITE_L2_HDR)) { - if (unlikely(!(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_WRITE_FAST_ETH_HDR))) { - dev_hard_header(skb, xmit_dev, ntohs(skb->protocol), - cm->xmit_dest_mac, cm->xmit_src_mac, len); - } else { - /* - * For the simple case we write this really fast. - */ - struct ethhdr *eth = (struct ethhdr *)__skb_push(skb, ETH_HLEN); - eth->h_proto = skb->protocol; - ether_addr_copy((u8 *)eth->h_dest, (u8 *)cm->xmit_dest_mac); - ether_addr_copy((u8 *)eth->h_source, (u8 *)cm->xmit_src_mac); - } - } - - /* - * Update priority of skb. - */ - if (unlikely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_PRIORITY_REMARK)) { - skb->priority = cm->priority; - } - - /* - * Mark outgoing packet. - */ - if (unlikely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_MARK)) { - skb->mark = cm->mark; - /* - * Update service class stats if SAWF is valid. - */ - if (likely(cm->sawf_valid)) { - service_class_id = SFE_GET_SAWF_SERVICE_CLASS(cm->mark); - sfe_ipv6_service_class_stats_inc(si, service_class_id, len); - } - } - - /* - * For the first packets, check if it could got fast xmit. - */ - if (unlikely(!(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_FAST_XMIT_FLOW_CHECKED) - && (cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_FAST_XMIT_DEV_ADMISSION))){ - cm->features = netif_skb_features(skb); - if (likely(sfe_fast_xmit_check(skb, cm->features))) { - cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_FAST_XMIT; - } - cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_FAST_XMIT_FLOW_CHECKED; - } - features = cm->features; - - fast_xmit = !!(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_FAST_XMIT); - - rcu_read_unlock(); - - this_cpu_inc(si->stats_pcpu->packets_forwarded64); - - /* - * We're going to check for GSO flags when we transmit the packet so - * start fetching the necessary cache line now. - */ - prefetch(skb_shinfo(skb)); - - /* - * We do per packet condition check before we could fast xmit the - * packet. - */ - if (likely(fast_xmit && dev_fast_xmit(skb, xmit_dev, features))) { - this_cpu_inc(si->stats_pcpu->packets_fast_xmited64); - return 1; - } - - /* - * Mark that this packet has been fast forwarded. - */ - skb->fast_forwarded = 1; - - /* - * Send the packet on its way. - */ - dev_queue_xmit(skb); - - return 1; -} diff --git a/shortcut-fe/sfe_ipv6_udp.h b/shortcut-fe/sfe_ipv6_udp.h deleted file mode 100644 index 6d6c543ee..000000000 --- a/shortcut-fe/sfe_ipv6_udp.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * sfe_ipv6_udp.h - * Shortcut forwarding engine header file for IPv6 UDP - * - * Copyright (c) 2015-2016, 2019-2020, The Linux Foundation. All rights reserved. - * Copyright (c) 2021-2022 Qualcomm Innovation Center, Inc. All rights reserved. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
- */ - -int sfe_ipv6_recv_udp(struct sfe_ipv6 *si, struct sk_buff *skb, struct net_device *dev, - unsigned int len, struct ipv6hdr *iph, unsigned int ihl, bool sync_on_find, - struct sfe_l2_info *l2_info, bool tun_outer); diff --git a/shortcut-fe/sfe_pppoe.c b/shortcut-fe/sfe_pppoe.c deleted file mode 100644 index 0944a8725..000000000 --- a/shortcut-fe/sfe_pppoe.c +++ /dev/null @@ -1,172 +0,0 @@ -/* - * sfe_pppoe.c - * API for shortcut forwarding engine PPPoE flows - * - * Copyright (c) 2021-2022 Qualcomm Innovation Center, Inc. All rights reserved. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ - -#include -#include -#include -#include - -#include "sfe_debug.h" -#include "sfe_api.h" -#include "sfe.h" -#include "sfe_pppoe.h" - -/* - * sfe_pppoe_br_accel_mode controls how to accelerate PPPoE bridge flow. - * - SFE_PPPOE_BR_ACCEL_MODE_EN_5T: 5-tuple (src_ip, dest_ip, src_port, dest_port, protocol) acceleration - * - SFE_PPPOE_BR_ACCEL_MODE_EN_3T: 3-tuple (src_ip, dest_ip, PPPoE session id) acceleration - * - SFE_PPPOE_BR_ACCEL_MODE_DISABLED: No acceleration - */ -static sfe_pppoe_br_accel_mode_t sfe_pppoe_br_accel_mode __read_mostly = SFE_PPPOE_BR_ACCEL_MODE_EN_5T; - -/* - * sfe_pppoe_get_br_accel_mode() - * Gets PPPoE bridge acceleration mode - */ -sfe_pppoe_br_accel_mode_t sfe_pppoe_get_br_accel_mode(void) -{ - return sfe_pppoe_br_accel_mode; -} -EXPORT_SYMBOL(sfe_pppoe_get_br_accel_mode); - -/* - * sfe_pppoe_set_br_accel_mode() - * Sets PPPoE bridge acceleration mode - */ -int sfe_pppoe_set_br_accel_mode(sfe_pppoe_br_accel_mode_t mode) -{ - if (mode >= SFE_PPPOE_BR_ACCEL_MODE_MAX) { - return -1; - } - - sfe_pppoe_br_accel_mode = mode; - return 0; -} - -/* - * sfe_pppoe_add_header() - * Add PPPoE header. - * - * skb->data will point to PPPoE header after the function - */ -void sfe_pppoe_add_header(struct sk_buff *skb, u16 pppoe_session_id, u16 ppp_protocol) -{ - struct pppoe_hdr *ph; - unsigned char *pp; - unsigned int data_len; - - /* - * Insert the PPP header protocol - */ - pp = __skb_push(skb, 2); - put_unaligned_be16(ppp_protocol, pp); - - data_len = skb->len; - - ph = (struct pppoe_hdr *)__skb_push(skb, sizeof(*ph)); - skb_reset_network_header(skb); - - /* - * Headers in skb will look like in below sequence - * | PPPoE hdr(6 bytes) | PPP hdr (2 bytes) | L3 hdr | - * - * The length field in the PPPoE header indicates the length of the PPPoE payload which - * consists of a 2-byte PPP header plus a skb->len. - */ - ph->ver = 1; - ph->type = 1; - ph->code = 0; - ph->sid = htons(pppoe_session_id); - ph->length = htons(data_len); - skb->protocol = htons(ETH_P_PPP_SES); -} - -/* - * sfe_pppoe_parse_hdr() - * Parse PPPoE header - * - * Returns true if the packet is good for further processing. 
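- *
- * On-wire layout this parser (and sfe_pppoe_add_header() above) deals
- * with, per RFC 2516 session-stage framing:
- *
- *    | 0x11 | code 0x00 | session id (2) | length (2) | PPP proto (2) | L3 payload |
- *    |<---------- struct pppoe_hdr, 6 bytes --------->|<-sfe_ppp_hdr->|
- *    // length = 2-byte PPP protocol + L3 payload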
- */ -bool sfe_pppoe_parse_hdr(struct sk_buff *skb, struct sfe_l2_info *l2_info) -{ - unsigned int len; - int pppoe_len; - struct sfe_ppp_hdr *ppp; - struct pppoe_hdr *ph = pppoe_hdr(skb); - - /* - * Check that we have space for PPPoE header here. - */ - if (unlikely(!pskb_may_pull(skb, (sizeof(struct pppoe_hdr) + sizeof(struct sfe_ppp_hdr))))) { - DEBUG_TRACE("%px: packet too short for PPPoE header\n", skb); - return false; - } - - len = skb->len; - pppoe_len = ntohs(ph->length); - if (unlikely(len < pppoe_len)) { - DEBUG_TRACE("%px: len: %u is too short to %u\n", skb, len, pppoe_len); - return false; - } - - ppp = (struct sfe_ppp_hdr *)((u8*)ph + sizeof(*ph)); - - /* - * Converting PPP protocol values to ether type protocol values - */ - switch(ntohs(ppp->protocol)) { - case PPP_IP: - sfe_l2_protocol_set(l2_info, ETH_P_IP); - break; - - case PPP_IPV6: - sfe_l2_protocol_set(l2_info, ETH_P_IPV6); - break; - - case PPP_LCP: - DEBUG_TRACE("%px: LCP packets are not supported in SFE\n", skb); - return false; - - default: - DEBUG_TRACE("%px: Unsupported protocol : %d in PPP header\n", skb, ntohs(ppp->protocol)); - return false; - } - - sfe_l2_parse_flag_set(l2_info, SFE_L2_PARSE_FLAGS_PPPOE_INGRESS); - sfe_l2_pppoe_session_id_set(l2_info, ntohs(ph->sid)); - - /* - * strip PPPoE header - */ - __skb_pull(skb, (sizeof(struct pppoe_hdr) + sizeof(struct sfe_ppp_hdr))); - skb_reset_network_header(skb); - - return true; -} - -/* - * sfe_pppoe_undo_parse() - * undo changes done to skb during PPPoE parsing - */ -void sfe_pppoe_undo_parse(struct sk_buff *skb, struct sfe_l2_info *l2_info) -{ - if (sfe_l2_parse_flag_check(l2_info, SFE_L2_PARSE_FLAGS_PPPOE_INGRESS)) { - __skb_push(skb, (sizeof(struct pppoe_hdr) + sizeof(struct sfe_ppp_hdr))); - } -} diff --git a/shortcut-fe/sfe_pppoe.h b/shortcut-fe/sfe_pppoe.h deleted file mode 100644 index 8329e30e2..000000000 --- a/shortcut-fe/sfe_pppoe.h +++ /dev/null @@ -1,35 +0,0 @@ -/* - * sfe_pppoe.h - * Shortcut flow acceleration for PPPoE flow - * - * Copyright (c) 2021-2022 Qualcomm Innovation Center, Inc. All rights reserved. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ - -#ifndef __SFE_PPPOE_H -#define __SFE_PPPOE_H - -#include -#include - -struct sfe_ppp_hdr { - u16 protocol; -}; - -void sfe_pppoe_add_header(struct sk_buff *skb, u16 pppoe_session_id, u16 ppp_protocol); -bool sfe_pppoe_parse_hdr(struct sk_buff *skb, struct sfe_l2_info *l2_info); -void sfe_pppoe_undo_parse(struct sk_buff *skb, struct sfe_l2_info *l2_info); -int sfe_pppoe_set_br_accel_mode(sfe_pppoe_br_accel_mode_t mode); - -#endif /* __SFE_PPPOE_H */ diff --git a/shortcut-fe/sfe_pppoe_mgr.c b/shortcut-fe/sfe_pppoe_mgr.c deleted file mode 100644 index 642296671..000000000 --- a/shortcut-fe/sfe_pppoe_mgr.c +++ /dev/null @@ -1,302 +0,0 @@ -/* - * Copyright (c) 2022, Qualcomm Innovation Center, Inc. 
All rights reserved. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "sfe_pppoe_mgr.h" -#include "sfe_debug.h" - -#define HASH_BUCKET_SIZE 2 /* ( 2^ HASH_BUCKET_SIZE ) == 4 */ - -static DEFINE_HASHTABLE(pppoe_session_table, HASH_BUCKET_SIZE); - -/* - * sfe_pppoe_mgr_get_session_info() - * Retrieve PPPoE session info associated with this netdevice - */ -static bool sfe_pppoe_mgr_get_session_info(struct net_device *dev, struct pppoe_opt *addressing) -{ - struct ppp_channel *channel[1] = {NULL}; - int px_proto; - int ppp_ch_count; - - if (ppp_is_multilink(dev)) { - DEBUG_WARN("%s: channel is multilink PPP\n", dev->name); - return false; - } - - ppp_ch_count = ppp_hold_channels(dev, channel, 1); - DEBUG_INFO("%s: PPP hold channel ret %d\n", dev->name, ppp_ch_count); - if (ppp_ch_count != 1) { - DEBUG_WARN("%s: hold channel for netdevice %px failed\n", dev->name, dev); - return false; - } - - px_proto = ppp_channel_get_protocol(channel[0]); - if (px_proto != PX_PROTO_OE) { - DEBUG_WARN("%s: session socket is not of type PX_PROTO_OE\n", dev->name); - ppp_release_channels(channel, 1); - return false; - } - - if (pppoe_channel_addressing_get(channel[0], addressing)) { - DEBUG_WARN("%s: failed to get addressing information\n", dev->name); - ppp_release_channels(channel, 1); - return false; - } - - DEBUG_TRACE("dev=%px %s %d: opt_dev=%px opt_dev_name=%s opt_dev_ifindex=%d opt_ifindex=%d\n", - dev, dev->name, dev->ifindex, - addressing->dev, addressing->dev->name, addressing->dev->ifindex, addressing->ifindex); - - /* - * pppoe_channel_addressing_get returns held device. - * So, put it back here. 
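- *
- * A note on the table above: DEFINE_HASHTABLE(name, bits) declares
- * 2^bits hlist heads, so HASH_BUCKET_SIZE == 2 yields 4 buckets, keyed
- * here by dev->ifindex. Roughly equivalent to:
- *
- *    struct hlist_head pppoe_session_table[1 << 2];  // 4 buckets
- *    // bucket for a session: hash_min(dev->ifindex, 2)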
 */
- dev_put(addressing->dev);
- ppp_release_channels(channel, 1);
- return true;
-}
-
-/*
- * sfe_pppoe_mgr_remove_session()
- * Remove PPPoE session entry from hash table
- */
-static void sfe_pppoe_mgr_remove_session(struct sfe_pppoe_mgr_session_entry *entry)
-{
- struct sfe_pppoe_mgr_session_info *info;
- info = &entry->info;
-
- DEBUG_INFO("%px %s %d: Remove PPPoE session entry with session_id=%u server_mac=%pM\n",
- entry, entry->dev->name, entry->dev->ifindex,
- info->session_id, info->server_mac);
-
- hash_del_rcu(&entry->hash_list);
- synchronize_rcu();
- kfree(entry);
-}
-
-/*
- * sfe_pppoe_mgr_add_session()
- * Create a PPPoE session entry and add it into hash table
- */
-static struct sfe_pppoe_mgr_session_entry *sfe_pppoe_mgr_add_session(struct net_device *dev, struct pppoe_opt *opt)
-
-{
- struct sfe_pppoe_mgr_session_entry *entry;
- struct sfe_pppoe_mgr_session_info *info;
-
- entry = kzalloc(sizeof(struct sfe_pppoe_mgr_session_entry), GFP_KERNEL);
- if (!entry) {
- DEBUG_WARN("%px: failed to allocate pppoe session entry\n", dev);
- return NULL;
- }
-
- info = &entry->info;
-
- /*
- * Save session info
- */
- info->session_id = (uint16_t)ntohs((uint16_t)opt->pa.sid);
- ether_addr_copy(info->server_mac, opt->pa.remote);
-
- entry->dev = dev;
-
- /*
- * There is no need for protecting simultaneous addition &
- * deletion of a PPPoE session entry as the PPP notifier chain
- * callback is invoked with the mutex held.
- */
- hash_add_rcu(pppoe_session_table,
- &entry->hash_list,
- dev->ifindex);
-
- DEBUG_INFO("%px %s %d: Add PPPoE session entry with session_id=%u server_mac=%pM\n",
- entry, dev->name, dev->ifindex,
- info->session_id, info->server_mac);
-
- return entry;
-}
-
-/*
- * sfe_pppoe_mgr_disconnect()
- * PPPoE interface's disconnect event handler
- */
-static int sfe_pppoe_mgr_disconnect(struct net_device *dev)
-{
- struct sfe_pppoe_mgr_session_entry *entry;
- struct sfe_pppoe_mgr_session_entry *found = NULL;
- struct hlist_node *temp;
- /*
- * check whether the interface is of type PPP
- */
- if (dev->type != ARPHRD_PPP || !(dev->flags & IFF_POINTOPOINT)) {
- return NOTIFY_DONE;
- }
-
- hash_for_each_possible_safe(pppoe_session_table, entry,
- temp, hash_list, dev->ifindex) {
- if (entry->dev != dev) {
- continue;
- }
-
- /*
- * In the hash list, there must be only one entry matching this net device.
- */
- found = entry;
- break;
- }
-
- if (!found) {
- DEBUG_WARN("%px: PPPoE session is not found for %s\n", dev, dev->name);
- return NOTIFY_DONE;
- }
-
- /*
- * Remove entry from hash table
- */
- sfe_pppoe_mgr_remove_session(found);
-
- return NOTIFY_DONE;
-}
-
-/*
- * sfe_pppoe_mgr_connect()
- * PPPoE interface's connect event handler
- */
-static int sfe_pppoe_mgr_connect(struct net_device *dev)
-{
- struct pppoe_opt opt;
- struct sfe_pppoe_mgr_session_entry *entry;
-
- /*
- * check whether the interface is of type PPP
- */
- if (dev->type != ARPHRD_PPP || !(dev->flags & IFF_POINTOPOINT)) {
- return NOTIFY_DONE;
- }
-
- if (sfe_pppoe_mgr_get_session_info(dev, &opt) == false) {
- DEBUG_WARN("%px: Unable to get pppoe session info from %s\n", dev, dev->name);
- return NOTIFY_DONE;
- }
-
- /*
- * Create a session entry and add it to the hash table
- */
- entry = sfe_pppoe_mgr_add_session(dev, &opt);
- if (!entry) {
- DEBUG_WARN("%s: PPPoE session add failed\n", dev->name);
- }
-
- return NOTIFY_DONE;
-}
-
-/*
- * sfe_pppoe_mgr_channel_notifier_handler()
- * PPPoE channel notifier handler.
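- *
- * The add/remove pair above follows the standard RCU discipline for a
- * hash table traversed by lockless readers: publish with hash_add_rcu(),
- * and on teardown unlink with hash_del_rcu(), wait out one grace period,
- * then free:
- *
- *    hash_del_rcu(&entry->hash_list);
- *    synchronize_rcu();    // no rcu_read_lock() reader can still see it
- *    kfree(entry);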
- */ -static int sfe_pppoe_mgr_channel_notifier_handler(struct notifier_block *nb, - unsigned long event, - void *arg) -{ - struct net_device *dev = (struct net_device *)arg; - - switch (event) { - case PPP_CHANNEL_CONNECT: - DEBUG_INFO("%s: PPP_CHANNEL_CONNECT event\n", dev->name); - return sfe_pppoe_mgr_connect(dev); - - case PPP_CHANNEL_DISCONNECT: - DEBUG_INFO("%s: PPP_CHANNEL_DISCONNECT event\n", dev->name); - return sfe_pppoe_mgr_disconnect(dev); - - default: - DEBUG_INFO("%s: Unhandled channel event: %lu\n", dev->name, event); - break; - } - - return NOTIFY_DONE; -} - -struct notifier_block sfe_pppoe_mgr_channel_notifier_nb = { - .notifier_call = sfe_pppoe_mgr_channel_notifier_handler, -}; - -/* - * sfe_pppoe_mgr_find_session() - * Find pppoe session entry given session ID and server MAC - */ -bool sfe_pppoe_mgr_find_session(uint16_t session_id, uint8_t *server_mac) -{ - struct sfe_pppoe_mgr_session_entry *entry; - struct sfe_pppoe_mgr_session_info *info; - struct hlist_node *temp; - int bkt; - - hash_for_each_safe(pppoe_session_table, bkt, temp, entry, hash_list) { - info = &entry->info; - if ((uint16_t)info->session_id == session_id && - ether_addr_equal(info->server_mac, server_mac)) { - - return true; - } - } - - DEBUG_INFO("PPPoE session entry not found: session_id %d server_mac %pM\n", session_id, server_mac); - - return false; -} - -/* - * sfe_pppoe_mgr_exit - * PPPoE mgr exit function - */ -void sfe_pppoe_mgr_exit(void) -{ - struct sfe_pppoe_mgr_session_entry *entry; - struct hlist_node *temp; - int bkt; - - /* - * Unregister the module from the PPP channel events. - */ - ppp_channel_connection_unregister_notify(&sfe_pppoe_mgr_channel_notifier_nb); - - hash_for_each_safe(pppoe_session_table, bkt, temp, entry, hash_list) { - sfe_pppoe_mgr_remove_session(entry); - } -} - -/* - * sfe_pppoe_mgr_init() - * PPPoE mgr init function - */ -int sfe_pppoe_mgr_init(void) -{ - /* - * Register the module to the PPP channel events. - */ - ppp_channel_connection_register_notify(&sfe_pppoe_mgr_channel_notifier_nb); - return 0; -} diff --git a/shortcut-fe/sfe_pppoe_mgr.h b/shortcut-fe/sfe_pppoe_mgr.h deleted file mode 100644 index f90966048..000000000 --- a/shortcut-fe/sfe_pppoe_mgr.h +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Copyright (c) 2022, Qualcomm Innovation Center, Inc. All rights reserved. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
- */ - -/* - * sfe_pppoe_mgr.h - * SFE PPPoE mgr definitions - */ - -#ifndef _SFE_PPPOE_MGR_H_ -#define _SFE_PPPOE_MGR_H_ - -/* - * struct sfe_pppoe_mgr_session_info - * Structure for PPPoE client driver session info - */ -struct sfe_pppoe_mgr_session_info { - uint32_t session_id; /* PPPoE Session ID */ - uint8_t server_mac[ETH_ALEN]; /* PPPoE server's MAC address */ -}; - -/* - * struct sfe_pppoe_mgr_session_entry - * Structure for PPPoE session entry into HASH table - */ -struct sfe_pppoe_mgr_session_entry { - struct sfe_pppoe_mgr_session_info info; - /* Session information */ - struct net_device *dev; /* Net device */ - struct hlist_node hash_list; /* Hash list for sessions */ -}; - -bool sfe_pppoe_mgr_find_session(uint16_t session_id, uint8_t *server_mac); -int sfe_pppoe_mgr_init(void); -void sfe_pppoe_mgr_exit(void); - -#endif diff --git a/shortcut-fe/sfe_vlan.h b/shortcut-fe/sfe_vlan.h deleted file mode 100644 index 263b69a8c..000000000 --- a/shortcut-fe/sfe_vlan.h +++ /dev/null @@ -1,218 +0,0 @@ -/* - * sfe_vlan.h - * Shortcut flow acceleration for 802.1AD/802.1Q flow - * - * Copyright (c) 2022 Qualcomm Innovation Center, Inc. All rights reserved. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ - -#ifndef __SFE_VLAN_H -#define __SFE_VLAN_H - -#include - -/* - * sfe_vlan_check_and_parse_tag() - * - * case 1: QinQ frame (e.g. outer tag = 88a80032, inner tag = 81000001): - * When entering this function: - * ----+-----------------+-----|-----+-----------+-----+--------- - * |DMAC |SMAC |88|a8|00|32|81|00|00|01|08|00|45|00| - * ----+-----------------+-----A-----+-----------+-----+--------- - * skb->data - * skb->protocol = ntohs(ETH_P_8021AD) - * skb->vlan_proto = 0 - * skb->vlan_tci = 0 - * skb->vlan_present = 0 - * When exiting: - * ----+-----------------+-----------+-----------+-----+--------- - * |DMAC |SMAC |88|a8|00|32|81|00|00|01|08|00|45|00| - * ----+-----------------+-----------+-----------+-----A--------- - * skb->data - * skb->protocol = ntohs(ETH_P_IP) - * skb->vlan_proto = 0 - * skb->vlan_tci = 0 - * skb->vlan_present = 0 - * l2_info->vlan_hdr_cnt = 2 - * l2_info->vlan_hdr[0].tpid = ntohs(ETH_P_8021AD) - * l2_info->vlan_hdr[0].tci = 0x0032 - * l2_info->vlan_hdr[1].tpid = ntohs(ETH_P_8021Q) - * l2_info->vlan_hdr[1].tci = 0x0001 - * l2_info->protocol = ETH_P_IP - * - * case 2: 802.1Q frame (e.g. 
the tag is 81000001): - * When entering this function: - * ----+-----------------+-----|-----+-----+--------- - * |DMAC |SMAC |81|00|00|01|08|00|45|00| - * ----+-----------------+-----A-----+-----+--------- - * skb->data - * skb->protocol = ntohs(ETH_P_8021Q) - * skb->vlan_proto = 0 - * skb->vlan_tci = 0 - * skb->vlan_present = 0 - * When exiting: - * ----+-----------------+-----------+-----+--------- - * |DMAC |SMAC |81|00|00|01|08|00|45|00| - * ----+-----------------+-----------+-----A--------- - * skb->data - * skb->protocol = ntohs(ETH_P_IP) - * skb->vlan_proto = 0 - * skb->vlan_tci = 0 - * skb->vlan_present = 0 - * l2_info->vlan_hdr_cnt = 1 - * l2_info->vlan_hdr[0].tpid = ntohs(ETH_P_8021Q) - * l2_info->vlan_hdr[0].tci = 0x0001 - * l2_info->protocol = ETH_P_IP - * - * case 3: untagged frame - * When entering this function: - * ----+-----------------+-----|--------------------- - * |DMAC |SMAC |08|00|45|00| - * ----+-----------------+-----A--------------------- - * skb->data - * skb->protocol = ntohs(ETH_P_IP) - * skb->vlan_proto = 0 - * skb->vlan_tci = 0 - * skb->vlan_present = 0 - * When exiting: - * ----+-----------------+-----|--------------------- - * |DMAC |SMAC |08|00|45|00| - * ----+-----------------+-----A--------------------- - * skb->data - * skb->protocol = ntohs(ETH_P_IP) - * skb->vlan_proto = 0 - * skb->vlan_tci = 0 - * skb->vlan_present = 0 - * l2_info->vlan_hdr_cnt = 0 - * l2_info->protocol = ETH_P_IP - */ -static inline bool sfe_vlan_check_and_parse_tag(struct sk_buff *skb, struct sfe_l2_info *l2_info) -{ - struct vlan_hdr *vhdr; - - while ((skb->protocol == htons(ETH_P_8021AD) || skb->protocol == htons(ETH_P_8021Q)) && - l2_info->vlan_hdr_cnt < SFE_MAX_VLAN_DEPTH) { - if (unlikely(!pskb_may_pull(skb, VLAN_HLEN))) { - return false; - } - vhdr = (struct vlan_hdr *)skb->data; - l2_info->vlan_hdr[l2_info->vlan_hdr_cnt].tpid = skb->protocol; - l2_info->vlan_hdr[l2_info->vlan_hdr_cnt].tci = ntohs(vhdr->h_vlan_TCI); - skb->protocol = vhdr->h_vlan_encapsulated_proto; - l2_info->vlan_hdr_cnt++; - /* - * strip VLAN header - */ - __skb_pull(skb, VLAN_HLEN); - skb_reset_network_header(skb); - } - - l2_info->protocol = htons(skb->protocol); - return true; -} - -/* - * sfe_vlan_undo_parse() - * Restore some skb fields which are modified when parsing VLAN tags. - */ -static inline void sfe_vlan_undo_parse(struct sk_buff *skb, struct sfe_l2_info *l2_info) -{ - if (l2_info->vlan_hdr_cnt == 0) { - return; - } - - skb->protocol = l2_info->vlan_hdr[0].tpid; - __skb_push(skb, l2_info->vlan_hdr_cnt * VLAN_HLEN); -} - -/* - * sfe_vlan_validate_ingress_tag() - * Validate ingress packet's VLAN tag - */ -static inline bool sfe_vlan_validate_ingress_tag( - struct sk_buff *skb, u8 count, struct sfe_vlan_hdr *vlan_hdr, struct sfe_l2_info *l2_info) -{ - u8 i; - - if (likely(!sfe_is_l2_feature_enabled())) { - return true; - } - - if (unlikely(count != l2_info->vlan_hdr_cnt)) { - return false; - } - - for (i = 0; i < count; i++) { - if (unlikely(vlan_hdr[i].tpid != l2_info->vlan_hdr[i].tpid)) { - return false; - } - - if (unlikely((vlan_hdr[i].tci & VLAN_VID_MASK) != - (l2_info->vlan_hdr[i].tci & VLAN_VID_MASK))) { - return false; - } - } - - return true; -} - -/* - * sfe_vlan_add_tag() - * Add VLAN tags at skb->data. - * Normally, it is called just before adding 14-byte Ethernet header. - * - * This function does not update skb->mac_header so later code - * needs to call skb_reset_mac_header()/skb_reset_mac_len() to - * get correct skb->mac_header/skb->mac_len. 
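 *
 * A hypothetical egress caller (the cm->egress_* field names here are
 * illustrative only, not part of this API) would therefore do:
 *
 *   sfe_vlan_add_tag(skb, cm->egress_vlan_hdr_cnt, cm->egress_vlan_hdr);
 *   dev_hard_header(skb, dev, ntohs(skb->protocol), dmac, smac, skb->len);
 *   skb_reset_mac_header(skb);
 *   skb_reset_mac_len(skb);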
- * - * It assumes: - * - skb->protocol is set - * - skb has enough headroom to write VLAN tags - * - 0 < count <= SFE_MAX_VLAN_DEPTH - * - * When entering (e.g. skb->protocol = ntohs(ETH_P_IP) or ntohs(ETH_P_PPP_SES)): - * -------------------------------+--------------------- - * |45|00|... - * -------------------------------A--------------------- - * skb->data - * -------------------------------v-----------------+-----+---------- - * |11|00|xx|xx|xx|xx|00|21|45|00|... - * -------------------------------+-----------------+-----+---------- - * - * When exiting (e.g. to add outer/inner tag = 88a80032/81000001): - * -------------+-----------+-----+--------------------- - * |00|32|81|00|00|01|08|00|45|00|05|d8|.... - * -------A-----+-----------+-----+--------------------- - * skb->data - * -------v-----+-----------+-----+-----------------+-----+---------- - * |00|32|81|00|00|01|88|64|11|00|xx|xx|xx|xx|00|21|45|00| - * -------------+-----------+-----+-----------------+-----+---------- - * skb->protocol = ntohs(ETH_P_8021AD) - */ -static inline void sfe_vlan_add_tag(struct sk_buff *skb, int count, struct sfe_vlan_hdr *vlan) -{ - struct vlan_hdr *vhdr; - int i; - vlan += (count - 1); - - for (i = 0; i < count; i++) { - vhdr = (struct vlan_hdr *)skb_push(skb, VLAN_HLEN); - vhdr->h_vlan_TCI = htons(vlan->tci); - vhdr->h_vlan_encapsulated_proto = skb->protocol; - skb->protocol = vlan->tpid; - vlan--; - } -} - -#endif /* __SFE_VLAN_H */ diff --git a/shortcut-fe/shortcut-fe/Makefile b/shortcut-fe/shortcut-fe/Makefile new file mode 100644 index 000000000..dd53042e5 --- /dev/null +++ b/shortcut-fe/shortcut-fe/Makefile @@ -0,0 +1,88 @@ +# +# Copyright (c) 2013-2018, 2020 The Linux Foundation. All rights reserved. +# Permission to use, copy, modify, and/or distribute this software for +# any purpose with or without fee is hereby granted, provided that the +# above copyright notice and this permission notice appear in all copies. +# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT +# OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +# + +include $(TOPDIR)/rules.mk +include $(INCLUDE_DIR)/kernel.mk + +PKG_NAME:=shortcut-fe +PKG_RELEASE:=8 + +include $(INCLUDE_DIR)/package.mk + +define KernelPackage/shortcut-fe + SECTION:=kernel + CATEGORY:=Kernel modules + SUBMENU:=Network Support + DEPENDS:=@IPV6 +kmod-nf-conntrack + TITLE:=Kernel driver for SFE + FILES:= \ + $(PKG_BUILD_DIR)/shortcut-fe.ko \ + $(PKG_BUILD_DIR)/shortcut-fe-ipv6.ko + KCONFIG:= \ + CONFIG_NF_CONNTRACK_EVENTS=y \ + CONFIG_NF_CONNTRACK_TIMEOUT=y \ + CONFIG_SHORTCUT_FE=y \ + CONFIG_XFRM=y + PROVIDES:=$(PKG_NAME) + AUTOLOAD:=$(call AutoLoad,09,shortcut-fe shortcut-fe-ipv6) +endef + +define KernelPackage/shortcut-fe/Description +Shortcut is an in-Linux-kernel IP packet forwarding engine. 
+endef + +define KernelPackage/shortcut-fe/install + $(INSTALL_DIR) $(1)/usr/bin + $(INSTALL_BIN) ./files/usr/bin/sfe_dump $(1)/usr/bin +endef + +HAVE_ECM:=$(CONFIG_PACKAGE_kmod-qca-nss-ecm-premium)$(CONFIG_PACKAGE_kmod-qca-nss-ecm-noload)$(CONFIG_PACKAGE_kmod-qca-nss-ecm-premium-noload)$(CONFIG_PACKAGE_kmod-qca-nss-ecm-standard) + +define KernelPackage/shortcut-fe-cm + SECTION:=kernel + CATEGORY:=Kernel modules + SUBMENU:=Network Support + DEPENDS:=+kmod-ipt-conntrack +kmod-shortcut-fe + TITLE:=Kernel driver for SFE + FILES:=$(PKG_BUILD_DIR)/shortcut-fe-cm.ko + KCONFIG:= \ + CONFIG_NF_CONNTRACK_CHAIN_EVENTS=y \ + CONFIG_NF_CONNTRACK_EVENTS=y \ + CONFIG_XFRM=y + CONFLICTS:=kmod-shortcut-fe-drv +endef + +define KernelPackage/shortcut-fe-cm/Description +Simple connection manager for the Shortcut forwarding engine. +endef + +define Build/Compile + $(MAKE) $(PKG_JOBS) -C "$(LINUX_DIR)" \ + $(KERNEL_MAKE_FLAGS) \ + $(PKG_MAKE_FLAGS) \ + M="$(PKG_BUILD_DIR)" \ + EXTRA_CFLAGS+="-DSFE_SUPPORT_IPV6" SFE_SUPPORT_IPV6=y \ + $(if $(HAVE_ECM),EXTRA_CFLAGS+="-DCONFIG_SFE_ECM" CONFIG_SFE_ECM=y,) \ + modules +endef + +ifneq ($(CONFIG_PACKAGE_kmod-shortcut-fe)$(CONFIG_PACKAGE_kmod-shortcut-fe-cm),) +define Build/InstallDev + $(INSTALL_DIR) $(1)/usr/include/shortcut-fe + $(CP) -rf $(PKG_BUILD_DIR)/sfe.h $(1)/usr/include/shortcut-fe +endef +endif + +$(eval $(call KernelPackage,shortcut-fe)) +$(eval $(call KernelPackage,shortcut-fe-cm)) diff --git a/shortcut-fe/shortcut-fe/files/etc/init.d/shortcut-fe b/shortcut-fe/shortcut-fe/files/etc/init.d/shortcut-fe new file mode 100644 index 000000000..838512a36 --- /dev/null +++ b/shortcut-fe/shortcut-fe/files/etc/init.d/shortcut-fe @@ -0,0 +1,51 @@ +#!/bin/sh /etc/rc.common +# +# Copyright (c) 2014-2015 The Linux Foundation. All rights reserved. +# Permission to use, copy, modify, and/or distribute this software for +# any purpose with or without fee is hereby granted, provided that the +# above copyright notice and this permission notice appear in all copies. +# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT +# OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+# + +#SFE connection manager has a lower priority, it should be started after other connection manager +#to detect the existence of connection manager with higher priority +START=70 + +have_cm() { + [ -d "/sys/kernel/debug/ecm" ] && echo 1 && return + + echo 0 +} + +#load shortcut-fe and connection manager +load_sfe() { + local kernel_version=$(uname -r) + + [ -d "/sys/module/shortcut_fe" ] || insmod /lib/modules/$kernel_version/shortcut-fe.ko + [ -d "/sys/module/shortcut_fe_ipv6" ] || insmod /lib/modules/$kernel_version/shortcut-fe-ipv6.ko + + [ -e "/lib/modules/$kernel_version/shortcut-fe-cm.ko" ] && { + [ -d /sys/module/shortcut_fe_cm ] || insmod /lib/modules/$kernel_version/shortcut-fe-cm.ko + } + [ -e "/lib/modules/$kernel_version/fast-classifier.ko" ] && { + [ -d /sys/module/fast_classifier ] || insmod /lib/modules/$kernel_version/fast-classifier.ko + } +} + +start() { + [ "$(have_cm)" = "0" ] && load_sfe +} + +stop() { + [ -d "/sys/module/shortcut_fe_cm" ] && rmmod shortcut_fe_cm + [ -d "/sys/module/shortcut_fe_ipv6" ] && rmmod shortcut_fe_ipv6 + [ -d "/sys/module/shortcut_fe" ] && rmmod shortcut_fe + [ -d "/sys/module/shortcut_fe_drv" ] && rmmod shortcut_fe_drv + [ -d "/sys/module/fast_classifier" ] && rmmod fast_classifier +} diff --git a/shortcut-fe/shortcut-fe/files/usr/bin/sfe_dump b/shortcut-fe/shortcut-fe/files/usr/bin/sfe_dump new file mode 100644 index 000000000..2a224e0ca --- /dev/null +++ b/shortcut-fe/shortcut-fe/files/usr/bin/sfe_dump @@ -0,0 +1,35 @@ +#!/bin/sh +# +# Copyright (c) 2015 The Linux Foundation. All rights reserved. +# Permission to use, copy, modify, and/or distribute this software for +# any purpose with or without fee is hereby granted, provided that the +# above copyright notice and this permission notice appear in all copies. +# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT +# OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +# + +#@sfe_dump +#@example : sfe_dump (ipv4|ipv6) +sfe_dump(){ + [ -e "/dev/sfe_ipv4" ] || { + dev_num=$(cat /sys/sfe_ipv4/debug_dev) + mknod /dev/sfe_ipv4 c $dev_num 0 + } + [ -e "/dev/sfe_ipv6" ] || { + dev_num=$(cat /sys/sfe_ipv6/debug_dev) + mknod /dev/sfe_ipv6 c $dev_num 0 + } + cat /dev/sfe_$1 +} + +if [ -z "$1" ]; then + sfe_dump ipv4 + sfe_dump ipv6 +else + sfe_dump $1 +fi diff --git a/shortcut-fe/shortcut-fe/src/Kconfig b/shortcut-fe/shortcut-fe/src/Kconfig new file mode 100644 index 000000000..487f1e065 --- /dev/null +++ b/shortcut-fe/shortcut-fe/src/Kconfig @@ -0,0 +1,14 @@ +# +# Shortcut forwarding engine +# + +config SHORTCUT_FE + tristate "Shortcut Forwarding Engine" + depends on NF_CONNTRACK + ---help--- + Shortcut is a fast in-kernel packet forwarding engine. + + To compile this code as a module, choose M here: the module will be + called shortcut-fe. + + If unsure, say N. diff --git a/shortcut-fe/shortcut-fe/src/Makefile b/shortcut-fe/shortcut-fe/src/Makefile new file mode 100644 index 000000000..991a20ec6 --- /dev/null +++ b/shortcut-fe/shortcut-fe/src/Makefile @@ -0,0 +1,24 @@ +# +# Makefile for Shortcut FE. 
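#
# Built out of tree against the target kernel; a typical invocation,
# mirroring the Build/Compile rule of the package Makefile above, is:
#
#   make -C "$LINUX_DIR" M="$PWD" SFE_SUPPORT_IPV6=y \
#        EXTRA_CFLAGS=-DSFE_SUPPORT_IPV6 modules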
+# + +obj-m += shortcut-fe.o + +ifdef SFE_SUPPORT_IPV6 +obj-m += shortcut-fe-ipv6.o +endif + +obj-m += shortcut-fe-cm.o + +shortcut-fe-objs := \ + sfe_ipv4.o + +ifdef SFE_SUPPORT_IPV6 +shortcut-fe-ipv6-objs := \ + sfe_ipv6.o +endif + +shortcut-fe-cm-objs := \ + sfe_cm.o + +ccflags-y += -Werror -Wall diff --git a/shortcut-fe/sfe_flow_cookie.h b/shortcut-fe/shortcut-fe/src/sfe.h similarity index 54% rename from shortcut-fe/sfe_flow_cookie.h rename to shortcut-fe/shortcut-fe/src/sfe.h index 12b34bb3a..279e7b3dc 100644 --- a/shortcut-fe/sfe_flow_cookie.h +++ b/shortcut-fe/shortcut-fe/src/sfe.h @@ -1,40 +1,75 @@ /* - * sfe_flow_cookie.h - * Flow cookie related callbacks. - * - * Copyright (c) 2013-2016, 2019-2020, The Linux Foundation. All rights reserved. - * Copyright (c) 2021 Qualcomm Innovation Center, Inc. All rights reserved. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. + * sfe.h + * Shortcut forwarding engine. * + * Copyright (c) 2013-2017 The Linux Foundation. All rights reserved. + * Permission to use, copy, modify, and/or distribute this software for + * any purpose with or without fee is hereby granted, provided that the + * above copyright notice and this permission notice appear in all copies. * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT + * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +/* + * The following are debug macros used throughout the SFE. + * + * The DEBUG_LEVEL enables the followings based on its value, + * when dynamic debug option is disabled. + * + * 0 = OFF + * 1 = ASSERTS / ERRORS + * 2 = 1 + WARN + * 3 = 2 + INFO + * 4 = 3 + TRACE + */ +#define DEBUG_LEVEL 2 + +#if (DEBUG_LEVEL < 1) +#define DEBUG_ASSERT(s, ...) +#define DEBUG_ERROR(s, ...) +#else +#define DEBUG_ASSERT(c, s, ...) if (!(c)) { pr_emerg("ASSERT: %s:%d:" s, __FUNCTION__, __LINE__, ##__VA_ARGS__); BUG(); } +#define DEBUG_ERROR(s, ...) pr_err("%s:%d:" s, __FUNCTION__, __LINE__, ##__VA_ARGS__) +#endif + +#if defined(CONFIG_DYNAMIC_DEBUG) +/* + * Compile messages for dynamic enable/disable + */ +#define DEBUG_WARN(s, ...) pr_debug("%s[%d]:" s, __FUNCTION__, __LINE__, ##__VA_ARGS__) +#define DEBUG_INFO(s, ...) pr_debug("%s[%d]:" s, __FUNCTION__, __LINE__, ##__VA_ARGS__) +#define DEBUG_TRACE(s, ...) pr_debug("%s[%d]:" s, __FUNCTION__, __LINE__, ##__VA_ARGS__) +#else + +/* + * Statically compile messages at different levels + */ +#if (DEBUG_LEVEL < 2) +#define DEBUG_WARN(s, ...) +#else +#define DEBUG_WARN(s, ...) pr_warn("%s[%d]:" s, __FUNCTION__, __LINE__, ##__VA_ARGS__) +#endif + +#if (DEBUG_LEVEL < 3) +#define DEBUG_INFO(s, ...) +#else +#define DEBUG_INFO(s, ...) pr_notice("%s[%d]:" s, __FUNCTION__, __LINE__, ##__VA_ARGS__) +#endif + +#if (DEBUG_LEVEL < 4) +#define DEBUG_TRACE(s, ...) +#else +#define DEBUG_TRACE(s, ...) 
pr_info("%s[%d]:" s, __FUNCTION__, __LINE__, ##__VA_ARGS__) +#endif +#endif + #ifdef CONFIG_NF_FLOW_COOKIE -#define SFE_FLOW_COOKIE_SIZE 2048 -#define SFE_FLOW_COOKIE_MASK 0x7ff - -struct sfe_ipv4_connection_match; -struct sfe_ipv6_connection_match; - -struct sfe_flow_cookie_entry { - struct sfe_ipv4_connection_match *match; - unsigned long last_clean_time; -}; - -struct sfe_ipv6_flow_cookie_entry { - struct sfe_ipv6_connection_match *match; - unsigned long last_clean_time; -}; - typedef int (*flow_cookie_set_func_t)(u32 protocol, __be32 src_ip, __be16 src_port, __be32 dst_ip, __be16 dst_port, u16 flow_cookie); /* diff --git a/shortcut-fe/shortcut-fe/src/sfe_backport.h b/shortcut-fe/shortcut-fe/src/sfe_backport.h new file mode 100644 index 000000000..d2d60c73c --- /dev/null +++ b/shortcut-fe/shortcut-fe/src/sfe_backport.h @@ -0,0 +1,195 @@ +/* + * sfe_backport.h + * Shortcut forwarding engine compatible header file. + * + * Copyright (c) 2014-2016 The Linux Foundation. All rights reserved. + * Permission to use, copy, modify, and/or distribute this software for + * any purpose with or without fee is hereby granted, provided that the + * above copyright notice and this permission notice appear in all copies. + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT + * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3, 4, 0)) +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3, 7, 0)) +#include +#else +enum udp_conntrack { + UDP_CT_UNREPLIED, + UDP_CT_REPLIED, + UDP_CT_MAX +}; + +static inline unsigned int * +nf_ct_timeout_lookup(struct net *net, struct nf_conn *ct, + struct nf_conntrack_l4proto *l4proto) +{ +#ifdef CONFIG_NF_CONNTRACK_TIMEOUT + struct nf_conn_timeout *timeout_ext; + unsigned int *timeouts; + + timeout_ext = nf_ct_timeout_find(ct); + if (timeout_ext) + timeouts = NF_CT_TIMEOUT_EXT_DATA(timeout_ext); + else + timeouts = l4proto->get_timeouts(net); + + return timeouts; +#else + return l4proto->get_timeouts(net); +#endif /*CONFIG_NF_CONNTRACK_TIMEOUT*/ +} +#endif /*KERNEL_VERSION(3, 7, 0)*/ +#endif /*KERNEL_VERSION(3, 4, 0)*/ + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 4, 0)) +#define sfe_define_post_routing_hook(FN_NAME, HOOKNUM, OPS, SKB, UNUSED, OUT, OKFN) \ +static unsigned int FN_NAME(void *priv, \ + struct sk_buff *SKB, \ + const struct nf_hook_state *state) +#elif (LINUX_VERSION_CODE >= KERNEL_VERSION(3, 13, 0)) +#define sfe_define_post_routing_hook(FN_NAME, HOOKNUM, OPS, SKB, UNUSED, OUT, OKFN) \ +static unsigned int FN_NAME(const struct nf_hook_ops *OPS, \ + struct sk_buff *SKB, \ + const struct net_device *UNUSED, \ + const struct net_device *OUT, \ + int (*OKFN)(struct sk_buff *)) +#else +#define sfe_define_post_routing_hook(FN_NAME, HOOKNUM, OPS, SKB, UNUSED, OUT, OKFN) \ +static unsigned int FN_NAME(unsigned int HOOKNUM, \ + struct sk_buff *SKB, \ + const struct net_device *UNUSED, \ + const struct net_device *OUT, \ + int (*OKFN)(struct sk_buff *)) +#endif + +#define sfe_cm_ipv4_post_routing_hook(HOOKNUM, OPS, SKB, UNUSED, OUT, OKFN) \ + sfe_define_post_routing_hook(__sfe_cm_ipv4_post_routing_hook, HOOKNUM, 
OPS, SKB, UNUSED, OUT, OKFN) +#define sfe_cm_ipv6_post_routing_hook(HOOKNUM, OPS, SKB, UNUSED, OUT, OKFN) \ + sfe_define_post_routing_hook(__sfe_cm_ipv6_post_routing_hook, HOOKNUM, OPS, SKB, UNUSED, OUT, OKFN) +#define fast_classifier_ipv4_post_routing_hook(HOOKNUM, OPS, SKB, UNUSED, OUT, OKFN) \ + sfe_define_post_routing_hook(__fast_classifier_ipv4_post_routing_hook, HOOKNUM, OPS, SKB, UNUSED, OUT, OKFN) +#define fast_classifier_ipv6_post_routing_hook(HOOKNUM, OPS, SKB, UNUSED, OUT, OKFN) \ + sfe_define_post_routing_hook(__fast_classifier_ipv6_post_routing_hook, HOOKNUM, OPS, SKB, UNUSED, OUT, OKFN) + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 4, 0)) +#define SFE_IPV4_NF_POST_ROUTING_HOOK(fn) \ + { \ + .hook = fn, \ + .pf = NFPROTO_IPV4, \ + .hooknum = NF_INET_POST_ROUTING, \ + .priority = NF_IP_PRI_NAT_SRC + 1, \ + } +#else +#define SFE_IPV4_NF_POST_ROUTING_HOOK(fn) \ + { \ + .hook = fn, \ + .owner = THIS_MODULE, \ + .pf = NFPROTO_IPV4, \ + .hooknum = NF_INET_POST_ROUTING, \ + .priority = NF_IP_PRI_NAT_SRC + 1, \ + } +#endif + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 4, 0)) +#define SFE_IPV6_NF_POST_ROUTING_HOOK(fn) \ + { \ + .hook = fn, \ + .pf = NFPROTO_IPV6, \ + .hooknum = NF_INET_POST_ROUTING, \ + .priority = NF_IP_PRI_NAT_SRC + 1, \ + } +#else +#define SFE_IPV6_NF_POST_ROUTING_HOOK(fn) \ + { \ + .hook = fn, \ + .owner = THIS_MODULE, \ + .pf = NFPROTO_IPV6, \ + .hooknum = NF_INET_POST_ROUTING, \ + .priority = NF_IP6_PRI_NAT_SRC + 1, \ + } +#endif + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0)) +#define SFE_NF_CT_DEFAULT_ZONE (&nf_ct_zone_dflt) +#else +#define SFE_NF_CT_DEFAULT_ZONE NF_CT_DEFAULT_ZONE +#endif + +/* + * sfe_dev_get_master + * get master of bridge port, and hold it + */ +static inline struct net_device *sfe_dev_get_master(struct net_device *dev) +{ + struct net_device *master; +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3, 9, 0)) + rcu_read_lock(); + master = netdev_master_upper_dev_get_rcu(dev); + if (master) + dev_hold(master); + + rcu_read_unlock(); +#else + master = dev->master; + if (master) + dev_hold(master); +#endif + return master; +} + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3, 11, 0)) +#define SFE_DEV_EVENT_PTR(PTR) netdev_notifier_info_to_dev(PTR) +#else +#define SFE_DEV_EVENT_PTR(PTR) (struct net_device *)(PTR) +#endif + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3, 13, 0)) +#define SFE_NF_CONN_ACCT(NM) struct nf_conn_acct *NM +#else +#define SFE_NF_CONN_ACCT(NM) struct nf_conn_counter *NM +#endif + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3, 13, 0)) +#define SFE_ACCT_COUNTER(NM) ((NM)->counter) +#else +#define SFE_ACCT_COUNTER(NM) (NM) +#endif + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3, 9, 0)) +#define sfe_hash_for_each_possible(name, obj, node, member, key) \ + hash_for_each_possible(name, obj, member, key) +#else +#define sfe_hash_for_each_possible(name, obj, node, member, key) \ + hash_for_each_possible(name, obj, node, member, key) +#endif + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3, 9, 0)) +#define sfe_hash_for_each(name, bkt, node, obj, member) \ + hash_for_each(name, bkt, obj, member) +#else +#define sfe_hash_for_each(name, bkt, node, obj, member) \ + hash_for_each(name, bkt, node, obj, member) +#endif + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3, 4, 0)) +#define sfe_dst_get_neighbour(dst, daddr) dst_neigh_lookup(dst, addr) +#else +static inline struct neighbour * +sfe_dst_get_neighbour(struct dst_entry *dst, void *daddr) +{ + struct neighbour *neigh = dst_get_neighbour_noref(dst); + + if (neigh) + neigh_hold(neigh); + + 
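	/*
	 * Both variants hand back a held reference, so every caller must
	 * balance it with neigh_release(), as sfe_cm_find_dev_and_mac_addr()
	 * in sfe_cm.c does. Note that the >= 3.4 macro above expands to
	 * dst_neigh_lookup(dst, addr) using 'addr' instead of its 'daddr'
	 * parameter; it appears to compile only because the call site's
	 * variable is itself named 'addr'.
	 */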
return neigh; +} +#endif diff --git a/shortcut-fe/shortcut-fe/src/sfe_cm.c b/shortcut-fe/shortcut-fe/src/sfe_cm.c new file mode 100644 index 000000000..2d3f79a04 --- /dev/null +++ b/shortcut-fe/shortcut-fe/src/sfe_cm.c @@ -0,0 +1,1210 @@ +/* + * sfe-cm.c + * Shortcut forwarding engine connection manager. + * + * Copyright (c) 2013-2018, 2020 The Linux Foundation. All rights reserved. + * Permission to use, copy, modify, and/or distribute this software for + * any purpose with or without fee is hereby granted, provided that the + * above copyright notice and this permission notice appear in all copies. + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT + * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sfe.h" +#include "sfe_cm.h" +#include "sfe_backport.h" + +typedef enum sfe_cm_exception { + SFE_CM_EXCEPTION_PACKET_BROADCAST, + SFE_CM_EXCEPTION_PACKET_MULTICAST, + SFE_CM_EXCEPTION_NO_IIF, + SFE_CM_EXCEPTION_NO_CT, + SFE_CM_EXCEPTION_CT_NO_TRACK, + SFE_CM_EXCEPTION_CT_NO_CONFIRM, + SFE_CM_EXCEPTION_CT_IS_ALG, + SFE_CM_EXCEPTION_IS_IPV4_MCAST, + SFE_CM_EXCEPTION_IS_IPV6_MCAST, + SFE_CM_EXCEPTION_TCP_NOT_ASSURED, + SFE_CM_EXCEPTION_TCP_NOT_ESTABLISHED, + SFE_CM_EXCEPTION_UNKNOW_PROTOCOL, + SFE_CM_EXCEPTION_NO_SRC_DEV, + SFE_CM_EXCEPTION_NO_SRC_XLATE_DEV, + SFE_CM_EXCEPTION_NO_DEST_DEV, + SFE_CM_EXCEPTION_NO_DEST_XLATE_DEV, + SFE_CM_EXCEPTION_NO_BRIDGE, + SFE_CM_EXCEPTION_LOCAL_OUT, + SFE_CM_EXCEPTION_MAX +} sfe_cm_exception_t; + +static char *sfe_cm_exception_events_string[SFE_CM_EXCEPTION_MAX] = { + "PACKET_BROADCAST", + "PACKET_MULTICAST", + "NO_IIF", + "NO_CT", + "CT_NO_TRACK", + "CT_NO_CONFIRM", + "CT_IS_ALG", + "IS_IPV4_MCAST", + "IS_IPV6_MCAST", + "TCP_NOT_ASSURED", + "TCP_NOT_ESTABLISHED", + "UNKNOW_PROTOCOL", + "NO_SRC_DEV", + "NO_SRC_XLATE_DEV", + "NO_DEST_DEV", + "NO_DEST_XLATE_DEV", + "NO_BRIDGE", + "LOCAL_OUT" +}; + +/* + * Per-module structure. + */ +struct sfe_cm { + spinlock_t lock; /* Lock for SMP correctness */ + + /* + * Control state. + */ + struct kobject *sys_sfe_cm; /* sysfs linkage */ + + /* + * Callback notifiers. + */ + struct notifier_block dev_notifier; /* Device notifier */ + struct notifier_block inet_notifier; /* IPv4 notifier */ + struct notifier_block inet6_notifier; /* IPv6 notifier */ + u32 exceptions[SFE_CM_EXCEPTION_MAX]; +}; + +static struct sfe_cm __sc; + +/* + * sfe_cm_incr_exceptions() + * increase an exception counter. + */ +static inline void sfe_cm_incr_exceptions(sfe_cm_exception_t except) +{ + struct sfe_cm *sc = &__sc; + + spin_lock_bh(&sc->lock); + sc->exceptions[except]++; + spin_unlock_bh(&sc->lock); +} + +/* + * sfe_cm_recv() + * Handle packet receives. + * + * Returns 1 if the packet is forwarded or 0 if it isn't. + */ +int sfe_cm_recv(struct sk_buff *skb) +{ + struct net_device *dev; + + /* + * We know that for the vast majority of packets we need the transport + * layer header so we may as well start to fetch it now! 
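 *
 * The constant 32 below is a cache-warming heuristic: with skb->data
 * at the IP header, data + 32 reaches into the TCP/UDP header for a
 * standard 20-byte IPv4 header, so the ports are warm by parse time.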
+ */ + prefetch(skb->data + 32); + barrier(); + + dev = skb->dev; + + /* + * We're only interested in IPv4 and IPv6 packets. + */ + if (likely(htons(ETH_P_IP) == skb->protocol)) { + struct in_device *in_dev; + + /* + * Does our input device support IP processing? + */ + in_dev = (struct in_device *)dev->ip_ptr; + if (unlikely(!in_dev)) { + DEBUG_TRACE("no IP processing for device: %s\n", dev->name); + return 0; + } + + /* + * Does it have an IP address? If it doesn't then we can't do anything + * interesting here! + */ + if (unlikely(!in_dev->ifa_list)) { + DEBUG_TRACE("no IP address for device: %s\n", dev->name); + return 0; + } + + return sfe_ipv4_recv(dev, skb); + } + + if (likely(htons(ETH_P_IPV6) == skb->protocol)) { + struct inet6_dev *in_dev; + + /* + * Does our input device support IPv6 processing? + */ + in_dev = (struct inet6_dev *)dev->ip6_ptr; + if (unlikely(!in_dev)) { + DEBUG_TRACE("no IPv6 processing for device: %s\n", dev->name); + return 0; + } + + /* + * Does it have an IPv6 address? If it doesn't then we can't do anything + * interesting here! + */ + if (unlikely(list_empty(&in_dev->addr_list))) { + DEBUG_TRACE("no IPv6 address for device: %s\n", dev->name); + return 0; + } + + return sfe_ipv6_recv(dev, skb); + } + + DEBUG_TRACE("not IP packet\n"); + return 0; +} + +/* + * sfe_cm_find_dev_and_mac_addr() + * Find the device and MAC address for a given IPv4/IPv6 address. + * + * Returns true if we find the device and MAC address, otherwise false. + * + * We look up the rtable entry for the address and, from its neighbour + * structure, obtain the hardware address. This means this function also + * works if the neighbours are routers too. + */ +static bool sfe_cm_find_dev_and_mac_addr(struct sk_buff *skb, sfe_ip_addr_t *addr, struct net_device **dev, u8 *mac_addr, int is_v4) +{ + struct neighbour *neigh; + struct rtable *rt; + struct rt6_info *rt6; + struct dst_entry *dst; + struct net_device *mac_dev; + + /* + * If we have skb provided, use it as the original code is unable + * to lookup routes that are policy routed. + */ + if (unlikely(skb)) { + dst = skb_dst(skb); + goto skip_dst_lookup; + } + + /* + * Look up the rtable entry for the IP address then get the hardware + * address from its neighbour structure. This means this work when the + * neighbours are routers too. 
+ */ + if (likely(is_v4)) { + rt = ip_route_output(&init_net, addr->ip, 0, 0, 0); + if (unlikely(IS_ERR(rt))) { + goto ret_fail; + } + + dst = (struct dst_entry *)rt; + } else { +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 17, 0)) + rt6 = rt6_lookup(&init_net, (struct in6_addr *)addr->ip6, 0, 0, 0); +#else + rt6 = rt6_lookup(&init_net, (struct in6_addr *)addr->ip6, 0, 0, NULL, 0); +#endif + if (!rt6) { + goto ret_fail; + } + + dst = (struct dst_entry *)rt6; + } + +skip_dst_lookup: + rcu_read_lock(); + neigh = sfe_dst_get_neighbour(dst, addr); + if (unlikely(!neigh)) { + rcu_read_unlock(); + if (likely(!skb)) + dst_release(dst); + goto ret_fail; + } + + if (unlikely(!(neigh->nud_state & NUD_VALID))) { + rcu_read_unlock(); + neigh_release(neigh); + if (likely(!skb)) + dst_release(dst); + goto ret_fail; + } + + mac_dev = neigh->dev; + if (!mac_dev) { + rcu_read_unlock(); + neigh_release(neigh); + if (likely(!skb)) + dst_release(dst); + goto ret_fail; + } + + memcpy(mac_addr, neigh->ha, (size_t)mac_dev->addr_len); + + dev_hold(mac_dev); + *dev = mac_dev; + rcu_read_unlock(); + neigh_release(neigh); + if (likely(!skb)) + dst_release(dst); + + return true; + +ret_fail: + if (is_v4) { + DEBUG_TRACE("failed to find MAC address for IP: %pI4\n", &addr->ip); + + } else { + DEBUG_TRACE("failed to find MAC address for IP: %pI6\n", addr->ip6); + } + + return false; +} + +/* + * sfe_cm_post_routing() + * Called for packets about to leave the box - either locally generated or forwarded from another interface + */ +static unsigned int sfe_cm_post_routing(struct sk_buff *skb, int is_v4) +{ + struct sfe_connection_create sic; + struct net_device *in; + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + struct net_device *dev; + struct net_device *src_dev; + struct net_device *dest_dev; + struct net_device *src_dev_tmp; + struct net_device *dest_dev_tmp; + struct net_device *src_br_dev = NULL; + struct net_device *dest_br_dev = NULL; + struct nf_conntrack_tuple orig_tuple; + struct nf_conntrack_tuple reply_tuple; + struct sk_buff *tmp_skb = NULL; + SFE_NF_CONN_ACCT(acct); + + #if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) + struct net *net=NULL; + struct nf_tcp_net *tn=NULL; + #endif + + /* + * Don't process broadcast or multicast packets. + */ + if (unlikely(skb->pkt_type == PACKET_BROADCAST)) { + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_PACKET_BROADCAST); + DEBUG_TRACE("broadcast, ignoring\n"); + return NF_ACCEPT; + } + if (unlikely(skb->pkt_type == PACKET_MULTICAST)) { + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_PACKET_MULTICAST); + DEBUG_TRACE("multicast, ignoring\n"); + return NF_ACCEPT; + } + +#ifdef CONFIG_XFRM + /* + * Packet to xfrm for encapsulation, we can't process it + */ + if (unlikely(skb_dst(skb)->xfrm)) { + DEBUG_TRACE("packet to xfrm, ignoring\n"); + return NF_ACCEPT; + } +#endif + + /* + * Don't process locally generated packets. + */ + if (skb->sk) { + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_LOCAL_OUT); + DEBUG_TRACE("skip local out packet\n"); + return NF_ACCEPT; + } + + /* + * Don't process packets that are not being forwarded. + */ + in = dev_get_by_index(&init_net, skb->skb_iif); + if (!in) { + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_IIF); + DEBUG_TRACE("packet not forwarding\n"); + return NF_ACCEPT; + } + + dev_put(in); + + /* + * Don't process packets that aren't being tracked by conntrack. 
+ */ + ct = nf_ct_get(skb, &ctinfo); + if (unlikely(!ct)) { + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_CT); + DEBUG_TRACE("no conntrack connection, ignoring\n"); + return NF_ACCEPT; + } + + /* + * Don't process untracked connections. + */ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0)) + if (unlikely(nf_ct_is_untracked(ct))) { +#else + if (unlikely(ctinfo == IP_CT_UNTRACKED)) { +#endif + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_CT_NO_TRACK); + DEBUG_TRACE("untracked connection\n"); + return NF_ACCEPT; + } + + /* + * Unconfirmed connection may be dropped by Linux at the final step, + * So we don't process unconfirmed connections. + */ + if (!nf_ct_is_confirmed(ct)) { + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_CT_NO_CONFIRM); + DEBUG_TRACE("unconfirmed connection\n"); + return NF_ACCEPT; + } + + /* + * Don't process connections that require support from a 'helper' (typically a NAT ALG). + */ + if (unlikely(nfct_help(ct))) { + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_CT_IS_ALG); + DEBUG_TRACE("connection has helper\n"); + return NF_ACCEPT; + } + + /* + * Check if the acceleration of a flow could be rejected quickly. + */ + acct = nf_conn_acct_find(ct); + if (acct) { + long long packets = atomic64_read(&SFE_ACCT_COUNTER(acct)[CTINFO2DIR(ctinfo)].packets); + if ((packets > 0xff) && (packets & 0xff)) { + /* + * Connection hits slow path at least 256 times, so it must be not able to accelerate. + * But we also give it a chance to walk through ECM every 256 packets + */ + return NF_ACCEPT; + } + } + + /* + * Look up the details of our connection in conntrack. + * + * Note that the data we get from conntrack is for the "ORIGINAL" direction + * but our packet may actually be in the "REPLY" direction. + */ + orig_tuple = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; + reply_tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple; + sic.protocol = (s32)orig_tuple.dst.protonum; + + sic.flags = 0; + + /* + * Get addressing information, non-NAT first + */ + if (likely(is_v4)) { + u32 dscp; + + sic.src_ip.ip = (__be32)orig_tuple.src.u3.ip; + sic.dest_ip.ip = (__be32)orig_tuple.dst.u3.ip; + + if (ipv4_is_multicast(sic.src_ip.ip) || ipv4_is_multicast(sic.dest_ip.ip)) { + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_IS_IPV4_MCAST); + DEBUG_TRACE("multicast address\n"); + return NF_ACCEPT; + } + + /* + * NAT'ed addresses - note these are as seen from the 'reply' direction + * When NAT does not apply to this connection these will be identical to the above. + */ + sic.src_ip_xlate.ip = (__be32)reply_tuple.dst.u3.ip; + sic.dest_ip_xlate.ip = (__be32)reply_tuple.src.u3.ip; + + dscp = ipv4_get_dsfield(ip_hdr(skb)) >> XT_DSCP_SHIFT; + if (dscp) { + sic.dest_dscp = dscp; + sic.src_dscp = sic.dest_dscp; + sic.flags |= SFE_CREATE_FLAG_REMARK_DSCP; + } + } else { + u32 dscp; + + sic.src_ip.ip6[0] = *((struct sfe_ipv6_addr *)&orig_tuple.src.u3.in6); + sic.dest_ip.ip6[0] = *((struct sfe_ipv6_addr *)&orig_tuple.dst.u3.in6); + + if (ipv6_addr_is_multicast((struct in6_addr *)sic.src_ip.ip6) || + ipv6_addr_is_multicast((struct in6_addr *)sic.dest_ip.ip6)) { + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_IS_IPV6_MCAST); + DEBUG_TRACE("multicast address\n"); + return NF_ACCEPT; + } + + /* + * NAT'ed addresses - note these are as seen from the 'reply' direction + * When NAT does not apply to this connection these will be identical to the above. 
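 *
 * Concretely (IPv4 case, same scheme here): with SNAT
 * 192.168.1.10 -> 203.0.113.2 toward 8.8.8.8, the ORIGINAL tuple is
 * (src 192.168.1.10, dst 8.8.8.8) and the REPLY tuple is
 * (src 8.8.8.8, dst 203.0.113.2), so src_ip_xlate becomes
 * 203.0.113.2 while dest_ip_xlate remains 8.8.8.8.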
+ */ + sic.src_ip_xlate.ip6[0] = *((struct sfe_ipv6_addr *)&reply_tuple.dst.u3.in6); + sic.dest_ip_xlate.ip6[0] = *((struct sfe_ipv6_addr *)&reply_tuple.src.u3.in6); + + dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> XT_DSCP_SHIFT; + if (dscp) { + sic.dest_dscp = dscp; + sic.src_dscp = sic.dest_dscp; + sic.flags |= SFE_CREATE_FLAG_REMARK_DSCP; + } + } + + switch (sic.protocol) { + case IPPROTO_TCP: + sic.src_port = orig_tuple.src.u.tcp.port; + sic.dest_port = orig_tuple.dst.u.tcp.port; + sic.src_port_xlate = reply_tuple.dst.u.tcp.port; + sic.dest_port_xlate = reply_tuple.src.u.tcp.port; + sic.src_td_window_scale = ct->proto.tcp.seen[0].td_scale; + sic.src_td_max_window = ct->proto.tcp.seen[0].td_maxwin; + sic.src_td_end = ct->proto.tcp.seen[0].td_end; + sic.src_td_max_end = ct->proto.tcp.seen[0].td_maxend; + sic.dest_td_window_scale = ct->proto.tcp.seen[1].td_scale; + sic.dest_td_max_window = ct->proto.tcp.seen[1].td_maxwin; + sic.dest_td_end = ct->proto.tcp.seen[1].td_end; + sic.dest_td_max_end = ct->proto.tcp.seen[1].td_maxend; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) + net = nf_ct_net(ct); + tn = nf_tcp_pernet(net); + if ((tn&&tn->tcp_no_window_check) +#else + if (nf_ct_tcp_no_window_check +#endif + || (ct->proto.tcp.seen[0].flags & IP_CT_TCP_FLAG_BE_LIBERAL) + || (ct->proto.tcp.seen[1].flags & IP_CT_TCP_FLAG_BE_LIBERAL)) { + sic.flags |= SFE_CREATE_FLAG_NO_SEQ_CHECK; + } + + /* + * Don't try to manage a non-established connection. + */ + if (!test_bit(IPS_ASSURED_BIT, &ct->status)) { + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_TCP_NOT_ASSURED); + DEBUG_TRACE("non-established connection\n"); + return NF_ACCEPT; + } + + /* + * If the connection is shutting down do not manage it. + * state can not be SYN_SENT, SYN_RECV because connection is assured + * Not managed states: FIN_WAIT, CLOSE_WAIT, LAST_ACK, TIME_WAIT, CLOSE. + */ + spin_lock_bh(&ct->lock); + if (ct->proto.tcp.state != TCP_CONNTRACK_ESTABLISHED) { + spin_unlock_bh(&ct->lock); + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_TCP_NOT_ESTABLISHED); + DEBUG_TRACE("connection in termination state: %#x, s: %pI4:%u, d: %pI4:%u\n", + ct->proto.tcp.state, &sic.src_ip, ntohs(sic.src_port), + &sic.dest_ip, ntohs(sic.dest_port)); + return NF_ACCEPT; + } + spin_unlock_bh(&ct->lock); + + /* + * Somehow, SFE is not playing nice with IPSec traffic. + * Do not accelerate for now. + */ + if (ntohs(sic.dest_port) == 4500 || ntohs(sic.dest_port) == 500) { + if (likely(is_v4)) + DEBUG_TRACE("IPsec bypass: %pI4:%d(%pI4:%d) to %pI4:%d(%pI4:%d)\n", + &sic.src_ip.ip, ntohs(sic.src_port), &sic.src_ip_xlate.ip, ntohs(sic.src_port_xlate), + &sic.dest_ip.ip, ntohs(sic.dest_port), &sic.dest_ip_xlate.ip, ntohs(sic.dest_port_xlate)); + else + DEBUG_TRACE("IPsec bypass: %pI6:%d to %pI6:%d\n", + &sic.src_ip.ip6, ntohs(sic.src_port), &sic.dest_ip.ip6, ntohs(sic.dest_port)); + return NF_ACCEPT; + } + break; + + case IPPROTO_UDP: + sic.src_port = orig_tuple.src.u.udp.port; + sic.dest_port = orig_tuple.dst.u.udp.port; + sic.src_port_xlate = reply_tuple.dst.u.udp.port; + sic.dest_port_xlate = reply_tuple.src.u.udp.port; + break; + + default: + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_UNKNOW_PROTOCOL); + DEBUG_TRACE("unhandled protocol %d\n", sic.protocol); + return NF_ACCEPT; + } + +#ifdef CONFIG_XFRM + sic.original_accel = 1; + sic.reply_accel = 1; + + /* + * For packets de-capsulated from xfrm, we still can accelerate it + * on the direction we just received the packet. 
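 *
 * The reverse direction must keep traversing the stack so xfrm can
 * re-encrypt it, which is why only one of reply_accel/original_accel
 * is cleared below, depending on CTINFO2DIR().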
+ */ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(5, 0, 0)) + if (unlikely(skb->sp)) { +#else + if (unlikely(secpath_exists(skb))) { +#endif + if (sic.protocol == IPPROTO_TCP && + !(sic.flags & SFE_CREATE_FLAG_NO_SEQ_CHECK)) { + return NF_ACCEPT; + } + + if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) { + sic.reply_accel = 0; + } else { + sic.original_accel = 0; + } + } +#endif + + /* + * Get QoS information + */ + if (skb->priority) { + sic.dest_priority = skb->priority; + sic.src_priority = sic.dest_priority; + sic.flags |= SFE_CREATE_FLAG_REMARK_PRIORITY; + } + + /* + * Get the net device and MAC addresses that correspond to the various source and + * destination host addresses. + */ + if (!sfe_cm_find_dev_and_mac_addr(NULL, &sic.src_ip, &src_dev_tmp, sic.src_mac, is_v4)) { + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_SRC_DEV); + return NF_ACCEPT; + } + src_dev = src_dev_tmp; + + if (!sfe_cm_find_dev_and_mac_addr(NULL, &sic.src_ip_xlate, &dev, sic.src_mac_xlate, is_v4)) { + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_SRC_XLATE_DEV); + goto done1; + } + dev_put(dev); + /* Somehow, for IPv6, we need this workaround as well */ + if (unlikely(!is_v4)) + tmp_skb = skb; + if (!sfe_cm_find_dev_and_mac_addr(tmp_skb, &sic.dest_ip, &dev, sic.dest_mac, is_v4)) { + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_DEST_DEV); + goto done1; + } + dev_put(dev); + + if (!sfe_cm_find_dev_and_mac_addr(skb, &sic.dest_ip_xlate, &dest_dev_tmp, sic.dest_mac_xlate, is_v4)) { + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_DEST_XLATE_DEV); + goto done1; + } + dest_dev = dest_dev_tmp; + + /* + * Our devices may actually be part of a bridge interface. If that's + * the case then find the bridge interface instead. + */ + if (src_dev->priv_flags & IFF_BRIDGE_PORT) { + src_br_dev = sfe_dev_get_master(src_dev); + if (!src_br_dev) { + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_BRIDGE); + DEBUG_TRACE("no bridge found for: %s\n", src_dev->name); + goto done2; + } + src_dev = src_br_dev; + } + + if (dest_dev->priv_flags & IFF_BRIDGE_PORT) { + dest_br_dev = sfe_dev_get_master(dest_dev); + if (!dest_br_dev) { + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_BRIDGE); + DEBUG_TRACE("no bridge found for: %s\n", dest_dev->name); + goto done3; + } + dest_dev = dest_br_dev; + } + + sic.src_dev = src_dev; + sic.dest_dev = dest_dev; + + sic.src_mtu = src_dev->mtu; + sic.dest_mtu = dest_dev->mtu; + + if (likely(is_v4)) { + sfe_ipv4_create_rule(&sic); + } else { + sfe_ipv6_create_rule(&sic); + } + + /* + * If we had bridge ports then release them too. + */ + if (dest_br_dev) { + dev_put(dest_br_dev); + } +done3: + if (src_br_dev) { + dev_put(src_br_dev); + } +done2: + dev_put(dest_dev_tmp); +done1: + dev_put(src_dev_tmp); + + return NF_ACCEPT; +} + +/* + * sfe_cm_ipv4_post_routing_hook() + * Called for packets about to leave the box - either locally generated or forwarded from another interface + */ +sfe_cm_ipv4_post_routing_hook(hooknum, ops, skb, in_unused, out, okfn) +{ + return sfe_cm_post_routing(skb, true); +} + +/* + * sfe_cm_ipv6_post_routing_hook() + * Called for packets about to leave the box - either locally generated or forwarded from another interface + */ +sfe_cm_ipv6_post_routing_hook(hooknum, ops, skb, in_unused, out, okfn) +{ + return sfe_cm_post_routing(skb, false); +} + +#ifdef CONFIG_NF_CONNTRACK_EVENTS +/* + * sfe_cm_conntrack_event() + * Callback event invoked when a conntrack connection's state changes. 
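 *
 * Only IPCT_DESTROY events are acted upon: the conntrack tuple is
 * translated into an sfe_connection_destroy and handed to
 * sfe_ipv4_destroy_rule()/sfe_ipv6_destroy_rule(), so a flow torn
 * down by Linux is also flushed from the accelerator.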
+ */ +#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS +static int sfe_cm_conntrack_event(struct notifier_block *this, + unsigned long events, void *ptr) +#else +static int sfe_cm_conntrack_event(unsigned int events, struct nf_ct_event *item) +#endif +{ +#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS + struct nf_ct_event *item = ptr; +#endif + struct sfe_connection_destroy sid; + struct nf_conn *ct = item->ct; + struct nf_conntrack_tuple orig_tuple; + + /* + * If we don't have a conntrack entry then we're done. + */ + if (unlikely(!ct)) { + DEBUG_WARN("no ct in conntrack event callback\n"); + return NOTIFY_DONE; + } + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0)) + if (unlikely(nf_ct_is_untracked(ct))) { + DEBUG_TRACE("ignoring untracked conn\n"); + return NOTIFY_DONE; + } +#endif + + /* + * We're only interested in destroy events. + */ + if (unlikely(!(events & (1 << IPCT_DESTROY)))) { + DEBUG_TRACE("ignoring non-destroy event\n"); + return NOTIFY_DONE; + } + + orig_tuple = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; + sid.protocol = (s32)orig_tuple.dst.protonum; + + /* + * Extract information from the conntrack connection. We're only interested + * in nominal connection information (i.e. we're ignoring any NAT information). + */ + switch (sid.protocol) { + case IPPROTO_TCP: + sid.src_port = orig_tuple.src.u.tcp.port; + sid.dest_port = orig_tuple.dst.u.tcp.port; + break; + + case IPPROTO_UDP: + sid.src_port = orig_tuple.src.u.udp.port; + sid.dest_port = orig_tuple.dst.u.udp.port; + break; + + default: + DEBUG_TRACE("unhandled protocol: %d\n", sid.protocol); + return NOTIFY_DONE; + } + + if (likely(nf_ct_l3num(ct) == AF_INET)) { + sid.src_ip.ip = (__be32)orig_tuple.src.u3.ip; + sid.dest_ip.ip = (__be32)orig_tuple.dst.u3.ip; + + sfe_ipv4_destroy_rule(&sid); + } else if (likely(nf_ct_l3num(ct) == AF_INET6)) { + sid.src_ip.ip6[0] = *((struct sfe_ipv6_addr *)&orig_tuple.src.u3.in6); + sid.dest_ip.ip6[0] = *((struct sfe_ipv6_addr *)&orig_tuple.dst.u3.in6); + + sfe_ipv6_destroy_rule(&sid); + } else { + DEBUG_TRACE("ignoring non-IPv4 and non-IPv6 connection\n"); + } + + return NOTIFY_DONE; +} + +/* + * Netfilter conntrack event system to monitor connection tracking changes + */ +#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS +static struct notifier_block sfe_cm_conntrack_notifier = { + .notifier_call = sfe_cm_conntrack_event, +}; +#else +static struct nf_ct_event_notifier sfe_cm_conntrack_notifier = { + .fcn = sfe_cm_conntrack_event, +}; +#endif +#endif + +/* + * Structure to establish a hook into the post routing netfilter point - this + * will pick up local outbound and packets going from one interface to another. + * + * Note: see include/linux/netfilter_ipv4.h for info related to priority levels. + * We want to examine packets after NAT translation and any ALG processing. + */ +static struct nf_hook_ops sfe_cm_ops_post_routing[] __read_mostly = { + SFE_IPV4_NF_POST_ROUTING_HOOK(__sfe_cm_ipv4_post_routing_hook), +#ifdef SFE_SUPPORT_IPV6 + SFE_IPV6_NF_POST_ROUTING_HOOK(__sfe_cm_ipv6_post_routing_hook), +#endif +}; + +/* + * sfe_cm_sync_rule() + * Synchronize a connection's state. 
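 *
 * Invoked from the SFE sync timer (see the callback registration in
 * sfe_cm_init() below): it refreshes the conntrack timeout and folds
 * the new byte/packet deltas back in, so flows that bypass the stack
 * do not expire from the Linux connection table.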
+ */ +static void sfe_cm_sync_rule(struct sfe_connection_sync *sis) +{ + struct nf_conntrack_tuple_hash *h; + struct nf_conntrack_tuple tuple; + struct nf_conn *ct; + SFE_NF_CONN_ACCT(acct); + + /* + * Create a tuple so as to be able to look up a connection + */ + memset(&tuple, 0, sizeof(tuple)); + tuple.src.u.all = (__be16)sis->src_port; + tuple.dst.dir = IP_CT_DIR_ORIGINAL; + tuple.dst.protonum = (u8)sis->protocol; + tuple.dst.u.all = (__be16)sis->dest_port; + + if (sis->is_v6) { + tuple.src.u3.in6 = *((struct in6_addr *)sis->src_ip.ip6); + tuple.dst.u3.in6 = *((struct in6_addr *)sis->dest_ip.ip6); + tuple.src.l3num = AF_INET6; + + DEBUG_TRACE("update connection - p: %d, s: %pI6:%u, d: %pI6:%u\n", + (int)tuple.dst.protonum, + &tuple.src.u3.in6, (unsigned int)ntohs(tuple.src.u.all), + &tuple.dst.u3.in6, (unsigned int)ntohs(tuple.dst.u.all)); + } else { + tuple.src.u3.ip = sis->src_ip.ip; + tuple.dst.u3.ip = sis->dest_ip.ip; + tuple.src.l3num = AF_INET; + + DEBUG_TRACE("update connection - p: %d, s: %pI4:%u, d: %pI4:%u\n", + (int)tuple.dst.protonum, + &tuple.src.u3.ip, (unsigned int)ntohs(tuple.src.u.all), + &tuple.dst.u3.ip, (unsigned int)ntohs(tuple.dst.u.all)); + } + + /* + * Look up conntrack connection + */ + h = nf_conntrack_find_get(&init_net, SFE_NF_CT_DEFAULT_ZONE, &tuple); + if (unlikely(!h)) { + DEBUG_TRACE("no connection found\n"); + return; + } + + ct = nf_ct_tuplehash_to_ctrack(h); +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 9, 0)) + NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct); +#endif + /* + * Only update if this is not a fixed timeout + */ + if (!test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) { + spin_lock_bh(&ct->lock); +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 9, 0)) + ct->timeout.expires += sis->delta_jiffies; +#else + ct->timeout += sis->delta_jiffies; +#endif + spin_unlock_bh(&ct->lock); + } + + acct = nf_conn_acct_find(ct); + if (acct) { + spin_lock_bh(&ct->lock); + atomic64_add(sis->src_new_packet_count, &SFE_ACCT_COUNTER(acct)[IP_CT_DIR_ORIGINAL].packets); + atomic64_add(sis->src_new_byte_count, &SFE_ACCT_COUNTER(acct)[IP_CT_DIR_ORIGINAL].bytes); + atomic64_add(sis->dest_new_packet_count, &SFE_ACCT_COUNTER(acct)[IP_CT_DIR_REPLY].packets); + atomic64_add(sis->dest_new_byte_count, &SFE_ACCT_COUNTER(acct)[IP_CT_DIR_REPLY].bytes); + spin_unlock_bh(&ct->lock); + } + + switch (sis->protocol) { + case IPPROTO_TCP: + spin_lock_bh(&ct->lock); + if (ct->proto.tcp.seen[0].td_maxwin < sis->src_td_max_window) { + ct->proto.tcp.seen[0].td_maxwin = sis->src_td_max_window; + } + if ((s32)(ct->proto.tcp.seen[0].td_end - sis->src_td_end) < 0) { + ct->proto.tcp.seen[0].td_end = sis->src_td_end; + } + if ((s32)(ct->proto.tcp.seen[0].td_maxend - sis->src_td_max_end) < 0) { + ct->proto.tcp.seen[0].td_maxend = sis->src_td_max_end; + } + if (ct->proto.tcp.seen[1].td_maxwin < sis->dest_td_max_window) { + ct->proto.tcp.seen[1].td_maxwin = sis->dest_td_max_window; + } + if ((s32)(ct->proto.tcp.seen[1].td_end - sis->dest_td_end) < 0) { + ct->proto.tcp.seen[1].td_end = sis->dest_td_end; + } + if ((s32)(ct->proto.tcp.seen[1].td_maxend - sis->dest_td_max_end) < 0) { + ct->proto.tcp.seen[1].td_maxend = sis->dest_td_max_end; + } + spin_unlock_bh(&ct->lock); + break; +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3, 4, 0)) + case IPPROTO_UDP: + /* + * In Linux connection track, UDP flow has two timeout values: + * /proc/sys/net/netfilter/nf_conntrack_udp_timeout: + * this is for uni-direction UDP flow, normally its value is 60 seconds + * 
/proc/sys/net/netfilter/nf_conntrack_udp_timeout_stream: + * this is for bi-direction UDP flow, normally its value is 180 seconds + * + * Linux will update timer of UDP flow to stream timeout once it seen packets + * in reply direction. But if flow is accelerated by NSS or SFE, Linux won't + * see any packets. So we have to do the same thing in our stats sync message. + */ + if (!test_bit(IPS_ASSURED_BIT, &ct->status) && acct) { + u_int64_t reply_pkts = atomic64_read(&SFE_ACCT_COUNTER(acct)[IP_CT_DIR_REPLY].packets); + + if (reply_pkts != 0) { + unsigned int *timeouts; + struct nf_conntrack_l4proto *l4proto __maybe_unused; + set_bit(IPS_SEEN_REPLY_BIT, &ct->status); + set_bit(IPS_ASSURED_BIT, &ct->status); + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 19, 0)) + l4proto = __nf_ct_l4proto_find((sis->is_v6 ? AF_INET6 : AF_INET), IPPROTO_UDP); + timeouts = nf_ct_timeout_lookup(&init_net, ct, l4proto); + spin_lock_bh(&ct->lock); + ct->timeout.expires = jiffies + timeouts[UDP_CT_REPLIED]; + spin_unlock_bh(&ct->lock); +#else + timeouts = nf_ct_timeout_lookup(ct); + if (!timeouts) { + timeouts = nf_udp_pernet(nf_ct_net(ct))->timeouts; + } + + spin_lock_bh(&ct->lock); + ct->timeout = jiffies + timeouts[UDP_CT_REPLIED]; + spin_unlock_bh(&ct->lock); +#endif + } + } + break; +#endif /*KERNEL_VERSION(3, 4, 0)*/ + } + + /* + * Release connection + */ + nf_ct_put(ct); +} + +/* + * sfe_cm_device_event() + */ +int sfe_cm_device_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct net_device *dev = SFE_DEV_EVENT_PTR(ptr); + + if (dev && (event == NETDEV_DOWN)) { + sfe_ipv4_destroy_all_rules_for_dev(dev); + sfe_ipv6_destroy_all_rules_for_dev(dev); + } + + return NOTIFY_DONE; +} + +/* + * sfe_cm_inet_event() + */ +static int sfe_cm_inet_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct net_device *dev = ((struct in_ifaddr *)ptr)->ifa_dev->dev; + + if (dev && (event == NETDEV_DOWN)) { + sfe_ipv4_destroy_all_rules_for_dev(dev); + } + + return NOTIFY_DONE; +} + +/* + * sfe_cm_inet6_event() + */ +static int sfe_cm_inet6_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct net_device *dev = ((struct inet6_ifaddr *)ptr)->idev->dev; + + if (dev && (event == NETDEV_DOWN)) { + sfe_ipv6_destroy_all_rules_for_dev(dev); + } + + return NOTIFY_DONE; +} + +/* + * sfe_cm_get_exceptions + * dump exception counters + */ +static ssize_t sfe_cm_get_exceptions(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + int idx, len; + struct sfe_cm *sc = &__sc; + + spin_lock_bh(&sc->lock); + for (len = 0, idx = 0; idx < SFE_CM_EXCEPTION_MAX; idx++) { + if (sc->exceptions[idx]) { + len += snprintf(buf + len, (ssize_t)(PAGE_SIZE - len), "%s = %d\n", sfe_cm_exception_events_string[idx], sc->exceptions[idx]); + } + } + spin_unlock_bh(&sc->lock); + + return len; +} + +/* + * sysfs attributes. 
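 *
 * The attribute below surfaces the exception counters at
 * /sys/sfe_cm/exceptions (the kobject is created in sfe_cm_init()),
 * so a quick health check from userspace is simply, for example:
 *
 *   cat /sys/sfe_cm/exceptions
 *   NO_CT = 13
 *   TCP_NOT_ESTABLISHED = 2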
+ */ +static const struct device_attribute sfe_cm_exceptions_attr = + __ATTR(exceptions, S_IRUGO, sfe_cm_get_exceptions, NULL); + +/* + * sfe_cm_init() + */ +static int __init sfe_cm_init(void) +{ + struct sfe_cm *sc = &__sc; + int result = -1; +#ifdef CONFIG_SFE_ECM + int (*fast_recv)(struct sk_buff *skb); +#endif + + DEBUG_INFO("SFE CM init\n"); + + /* + * Create sys/sfe_cm + */ + sc->sys_sfe_cm = kobject_create_and_add("sfe_cm", NULL); + if (!sc->sys_sfe_cm) { + DEBUG_ERROR("failed to register sfe_cm\n"); + goto exit1; + } + + /* + * Create sys/sfe_cm/exceptions + */ + result = sysfs_create_file(sc->sys_sfe_cm, &sfe_cm_exceptions_attr.attr); + if (result) { + DEBUG_ERROR("failed to register exceptions file: %d\n", result); + goto exit2; + } + + sc->dev_notifier.notifier_call = sfe_cm_device_event; + sc->dev_notifier.priority = 1; + register_netdevice_notifier(&sc->dev_notifier); + + sc->inet_notifier.notifier_call = sfe_cm_inet_event; + sc->inet_notifier.priority = 1; + register_inetaddr_notifier(&sc->inet_notifier); + + sc->inet6_notifier.notifier_call = sfe_cm_inet6_event; + sc->inet6_notifier.priority = 1; + register_inet6addr_notifier(&sc->inet6_notifier); + /* + * Register our netfilter hooks. + */ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 13, 0)) + result = nf_register_hooks(sfe_cm_ops_post_routing, ARRAY_SIZE(sfe_cm_ops_post_routing)); +#else + result = nf_register_net_hooks(&init_net, sfe_cm_ops_post_routing, ARRAY_SIZE(sfe_cm_ops_post_routing)); +#endif + if (result < 0) { + DEBUG_ERROR("can't register nf post routing hook: %d\n", result); + goto exit3; + } + + /* + * Register a notifier hook to get fast notifications of expired connections. + * Note: In CONFIG_NF_CONNTRACK_CHAIN_EVENTS enabled case, nf_conntrack_register_notifier() + * function always returns 0. + */ +#ifdef CONFIG_NF_CONNTRACK_EVENTS +#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS + result = nf_conntrack_register_chain_notifier(&init_net, &sfe_cm_conntrack_notifier); +#else + result = nf_conntrack_register_notifier(&init_net, &sfe_cm_conntrack_notifier); +#endif + if (result < 0) { + DEBUG_ERROR("can't register nf notifier hook: %d\n", result); + goto exit4; + } +#endif + spin_lock_init(&sc->lock); + + /* + * Hook the receive path in the network stack. + */ +#ifdef CONFIG_SFE_ECM + rcu_read_lock(); + fast_recv = rcu_dereference(athrs_fast_nat_recv); + rcu_read_unlock(); + if (!fast_recv) { + BUG_ON(athrs_fast_nat_recv); + } +#else + BUG_ON(athrs_fast_nat_recv); +#endif + RCU_INIT_POINTER(athrs_fast_nat_recv, sfe_cm_recv); + + /* + * Hook the shortcut sync callback. + */ + sfe_ipv4_register_sync_rule_callback(sfe_cm_sync_rule); + sfe_ipv6_register_sync_rule_callback(sfe_cm_sync_rule); + return 0; + +#ifdef CONFIG_NF_CONNTRACK_EVENTS +exit4: +#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 13, 0)) + nf_unregister_hooks(sfe_cm_ops_post_routing, ARRAY_SIZE(sfe_cm_ops_post_routing)); +#else + nf_unregister_net_hooks(&init_net, sfe_cm_ops_post_routing, ARRAY_SIZE(sfe_cm_ops_post_routing)); +#endif + +#endif +#endif +exit3: + unregister_inet6addr_notifier(&sc->inet6_notifier); + unregister_inetaddr_notifier(&sc->inet_notifier); + unregister_netdevice_notifier(&sc->dev_notifier); +exit2: + kobject_put(sc->sys_sfe_cm); + +exit1: + return result; +} + +/* + * sfe_cm_exit() + */ +static void __exit sfe_cm_exit(void) +{ + struct sfe_cm *sc = &__sc; + + DEBUG_INFO("SFE CM exit\n"); + + /* + * Unregister our sync callback. 
+ */ + sfe_ipv4_register_sync_rule_callback(NULL); + sfe_ipv6_register_sync_rule_callback(NULL); + + /* + * Unregister our receive callback. + */ + RCU_INIT_POINTER(athrs_fast_nat_recv, NULL); + + /* + * Wait for all callbacks to complete. + */ + rcu_barrier(); + + /* + * Destroy all connections. + */ + sfe_ipv4_destroy_all_rules_for_dev(NULL); + sfe_ipv6_destroy_all_rules_for_dev(NULL); + +#ifdef CONFIG_NF_CONNTRACK_EVENTS +#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS + nf_conntrack_unregister_chain_notifier(&init_net, &sfe_cm_conntrack_notifier); +#else + nf_conntrack_unregister_notifier(&init_net, &sfe_cm_conntrack_notifier); +#endif +#endif + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 13, 0)) + nf_unregister_hooks(sfe_cm_ops_post_routing, ARRAY_SIZE(sfe_cm_ops_post_routing)); +#else + nf_unregister_net_hooks(&init_net, sfe_cm_ops_post_routing, ARRAY_SIZE(sfe_cm_ops_post_routing)); +#endif + unregister_inet6addr_notifier(&sc->inet6_notifier); + unregister_inetaddr_notifier(&sc->inet_notifier); + unregister_netdevice_notifier(&sc->dev_notifier); + + kobject_put(sc->sys_sfe_cm); +} + +module_init(sfe_cm_init) +module_exit(sfe_cm_exit) + +MODULE_DESCRIPTION("Shortcut Forwarding Engine - Connection Manager"); +MODULE_LICENSE("Dual BSD/GPL"); + diff --git a/shortcut-fe/shortcut-fe/src/sfe_cm.h b/shortcut-fe/shortcut-fe/src/sfe_cm.h new file mode 100644 index 000000000..124c86f47 --- /dev/null +++ b/shortcut-fe/shortcut-fe/src/sfe_cm.h @@ -0,0 +1,260 @@ +/* + * sfe_cm.h + * Shortcut forwarding engine. + * + * Copyright (c) 2013-2016 The Linux Foundation. All rights reserved. + * Permission to use, copy, modify, and/or distribute this software for + * any purpose with or without fee is hereby granted, provided that the + * above copyright notice and this permission notice appear in all copies. + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT + * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +/* + * connection flags. + */ +#define SFE_CREATE_FLAG_NO_SEQ_CHECK BIT(0) + /* Indicates that we should not check sequence numbers */ +#define SFE_CREATE_FLAG_REMARK_PRIORITY BIT(1) + /* Indicates that we should remark priority of skb */ +#define SFE_CREATE_FLAG_REMARK_DSCP BIT(2) + /* Indicates that we should remark DSCP of packet */ + +/* + * IPv6 address structure + */ +struct sfe_ipv6_addr { + __be32 addr[4]; +}; + +typedef union { + __be32 ip; + struct sfe_ipv6_addr ip6[1]; +} sfe_ip_addr_t; + +/* + * connection creation structure. 
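+ * + * A connection manager fills in one of these per offload request; for + * example (an illustrative sketch, not part of this patch), to create a + * TCP rule that skips sequence-number checking: + * + *	struct sfe_connection_create sic; + * + *	memset(&sic, 0, sizeof(sic)); + *	sic.protocol = IPPROTO_TCP; + *	sic.flags |= SFE_CREATE_FLAG_NO_SEQ_CHECK; + *	... fill in devices, addresses, ports and MACs ... + *	sfe_ipv4_create_rule(&sic); 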
*/ +struct sfe_connection_create { + int protocol; + struct net_device *src_dev; + struct net_device *dest_dev; + u32 flags; + u32 src_mtu; + u32 dest_mtu; + sfe_ip_addr_t src_ip; + sfe_ip_addr_t src_ip_xlate; + sfe_ip_addr_t dest_ip; + sfe_ip_addr_t dest_ip_xlate; + __be16 src_port; + __be16 src_port_xlate; + __be16 dest_port; + __be16 dest_port_xlate; + u8 src_mac[ETH_ALEN]; + u8 src_mac_xlate[ETH_ALEN]; + u8 dest_mac[ETH_ALEN]; + u8 dest_mac_xlate[ETH_ALEN]; + u8 src_td_window_scale; + u32 src_td_max_window; + u32 src_td_end; + u32 src_td_max_end; + u8 dest_td_window_scale; + u32 dest_td_max_window; + u32 dest_td_end; + u32 dest_td_max_end; + u32 mark; +#ifdef CONFIG_XFRM + u32 original_accel; + u32 reply_accel; +#endif + u32 src_priority; + u32 dest_priority; + u32 src_dscp; + u32 dest_dscp; +}; + +/* + * connection destruction structure. + */ +struct sfe_connection_destroy { + int protocol; + sfe_ip_addr_t src_ip; + sfe_ip_addr_t dest_ip; + __be16 src_port; + __be16 dest_port; +}; + +typedef enum sfe_sync_reason { + SFE_SYNC_REASON_STATS, /* Sync is to synchronize stats */ + SFE_SYNC_REASON_FLUSH, /* Sync is to flush an entry */ + SFE_SYNC_REASON_DESTROY /* Sync is to destroy an entry (requested by the connection manager) */ +} sfe_sync_reason_t; + +/* + * Structure used to sync connection stats/state back within the system. + * + * NOTE: The addresses here are NON-NAT addresses, i.e. the true endpoint addressing. + * 'src' is the creator of the connection. + */ +struct sfe_connection_sync { + struct net_device *src_dev; + struct net_device *dest_dev; + int is_v6; /* Is it for IPv6? */ + int protocol; /* IP protocol number (IPPROTO_...) */ + sfe_ip_addr_t src_ip; /* Non-NAT source address, i.e. the creator of the connection */ + sfe_ip_addr_t src_ip_xlate; /* NATed source address */ + __be16 src_port; /* Non-NAT source port */ + __be16 src_port_xlate; /* NATed source port */ + sfe_ip_addr_t dest_ip; /* Non-NAT destination address, i.e. to whom the connection was created */ + sfe_ip_addr_t dest_ip_xlate; /* NATed destination address */ + __be16 dest_port; /* Non-NAT destination port */ + __be16 dest_port_xlate; /* NATed destination port */ + u32 src_td_max_window; + u32 src_td_end; + u32 src_td_max_end; + u64 src_packet_count; + u64 src_byte_count; + u32 src_new_packet_count; + u32 src_new_byte_count; + u32 dest_td_max_window; + u32 dest_td_end; + u32 dest_td_max_end; + u64 dest_packet_count; + u64 dest_byte_count; + u32 dest_new_packet_count; + u32 dest_new_byte_count; + u32 reason; /* reason for stats sync message, i.e. destroy, flush, periodic sync */ + u64 delta_jiffies; /* Time to be added to the current timeout to keep the connection alive */ +}; + +/* + * connection mark structure + */ +struct sfe_connection_mark { + int protocol; + sfe_ip_addr_t src_ip; + sfe_ip_addr_t dest_ip; + __be16 src_port; + __be16 dest_port; + u32 mark; +}; + +/* + * Expose the hook for the receive processing. + */ +extern int (*athrs_fast_nat_recv)(struct sk_buff *skb); + +/* + * Expose what should be a static flag in the TCP connection tracker. + */ +#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 15, 0) +extern int nf_ct_tcp_no_window_check; +#endif +/* + * This callback will be called from a timer, + * 100 times per second, to sync stats back to + * Linux connection tracking. + * + * An RCU lock is taken to prevent this callback + * from unregistering. 
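+ * + * A minimal consumer sketch (my_sync_cb is a hypothetical name, not part + * of this patch): + * + *	static void my_sync_cb(struct sfe_connection_sync *sis) + *	{ + *		... feed sis->src_new_packet_count, sis->dest_new_byte_count + *		and sis->delta_jiffies back into conntrack here ... + *	} + * + *	sfe_ipv4_register_sync_rule_callback(my_sync_cb); 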
*/ +typedef void (*sfe_sync_rule_callback_t)(struct sfe_connection_sync *); + +/* + * IPv4 APIs used by connection manager + */ +int sfe_ipv4_recv(struct net_device *dev, struct sk_buff *skb); +int sfe_ipv4_create_rule(struct sfe_connection_create *sic); +void sfe_ipv4_destroy_rule(struct sfe_connection_destroy *sid); +void sfe_ipv4_destroy_all_rules_for_dev(struct net_device *dev); +void sfe_ipv4_register_sync_rule_callback(sfe_sync_rule_callback_t callback); +void sfe_ipv4_update_rule(struct sfe_connection_create *sic); +void sfe_ipv4_mark_rule(struct sfe_connection_mark *mark); + +#ifdef SFE_SUPPORT_IPV6 +/* + * IPv6 APIs used by connection manager + */ +int sfe_ipv6_recv(struct net_device *dev, struct sk_buff *skb); +int sfe_ipv6_create_rule(struct sfe_connection_create *sic); +void sfe_ipv6_destroy_rule(struct sfe_connection_destroy *sid); +void sfe_ipv6_destroy_all_rules_for_dev(struct net_device *dev); +void sfe_ipv6_register_sync_rule_callback(sfe_sync_rule_callback_t callback); +void sfe_ipv6_update_rule(struct sfe_connection_create *sic); +void sfe_ipv6_mark_rule(struct sfe_connection_mark *mark); +#else +static inline int sfe_ipv6_recv(struct net_device *dev, struct sk_buff *skb) +{ + return 0; +} + +static inline int sfe_ipv6_create_rule(struct sfe_connection_create *sic) +{ + return 0; +} + +static inline void sfe_ipv6_destroy_rule(struct sfe_connection_destroy *sid) +{ + return; +} + +static inline void sfe_ipv6_destroy_all_rules_for_dev(struct net_device *dev) +{ + return; +} + +static inline void sfe_ipv6_register_sync_rule_callback(sfe_sync_rule_callback_t callback) +{ + return; +} + +static inline void sfe_ipv6_update_rule(struct sfe_connection_create *sic) +{ + return; +} + +static inline void sfe_ipv6_mark_rule(struct sfe_connection_mark *mark) +{ + return; +} +#endif + +/* + * sfe_ipv6_addr_equal() + * compare ipv6 address + * + * return: 1, equal; 0, not equal + */ +static inline int sfe_ipv6_addr_equal(struct sfe_ipv6_addr *a, + struct sfe_ipv6_addr *b) +{ + return a->addr[0] == b->addr[0] && + a->addr[1] == b->addr[1] && + a->addr[2] == b->addr[2] && + a->addr[3] == b->addr[3]; +} + +/* + * sfe_ipv4_addr_equal() + * compare ipv4 address + * + * return: 1, equal; 0, not equal + */ +#define sfe_ipv4_addr_equal(a, b) ((u32)(a) == (u32)(b)) + +/* + * sfe_addr_equal() + * compare ipv4 or ipv6 address + * + * return: 1, equal; 0, not equal + */ +static inline int sfe_addr_equal(sfe_ip_addr_t *a, + sfe_ip_addr_t *b, int is_v4) +{ + return is_v4 ? sfe_ipv4_addr_equal(a->ip, b->ip) : sfe_ipv6_addr_equal(a->ip6, b->ip6); +} diff --git a/shortcut-fe/shortcut-fe/src/sfe_ipv4.c b/shortcut-fe/shortcut-fe/src/sfe_ipv4.c new file mode 100644 index 000000000..cdcdd66dd --- /dev/null +++ b/shortcut-fe/shortcut-fe/src/sfe_ipv4.c @@ -0,0 +1,3618 @@ +/* + * sfe_ipv4.c + * Shortcut forwarding engine - IPv4 edition. + * + * Copyright (c) 2013-2016, 2019-2020 The Linux Foundation. All rights reserved. + * Permission to use, copy, modify, and/or distribute this software for + * any purpose with or without fee is hereby granted, provided that the + * above copyright notice and this permission notice appear in all copies. + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT + * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include <linux/module.h> +#include <linux/sysfs.h> +#include <linux/skbuff.h> +#include <linux/icmp.h> +#include <net/tcp.h> +#include <linux/etherdevice.h> +#include <linux/version.h> + +#include "sfe.h" +#include "sfe_cm.h" + +/* + * By default Linux IP header and transport layer header structures are + * unpacked, assuming that such headers should be 32-bit aligned. + * Unfortunately some wireless adaptors can't cope with this requirement and + * some CPUs can't handle misaligned accesses. For those platforms we + * define SFE_IPV4_UNALIGNED_IP_HEADER and mark the structures as packed. + * When we do this the compiler will generate slightly worse code than for the + * aligned case (on most platforms) but will be much quicker than fixing + * things up in an unaligned trap handler. + */ +#define SFE_IPV4_UNALIGNED_IP_HEADER 1 +#if SFE_IPV4_UNALIGNED_IP_HEADER +#define SFE_IPV4_UNALIGNED_STRUCT __attribute__((packed)) +#else +#define SFE_IPV4_UNALIGNED_STRUCT +#endif + +/* + * An Ethernet header, but with an optional "packed" attribute to + * help with performance on some platforms (see the definition of + * SFE_IPV4_UNALIGNED_STRUCT) + */ +struct sfe_ipv4_eth_hdr { + __be16 h_dest[ETH_ALEN / 2]; + __be16 h_source[ETH_ALEN / 2]; + __be16 h_proto; +} SFE_IPV4_UNALIGNED_STRUCT; + +#define SFE_IPV4_DSCP_MASK 0x3 +#define SFE_IPV4_DSCP_SHIFT 2 + +/* + * An IPv4 header, but with an optional "packed" attribute to + * help with performance on some platforms (see the definition of + * SFE_IPV4_UNALIGNED_STRUCT) + */ +struct sfe_ipv4_ip_hdr { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 ihl:4, + version:4; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u8 version:4, + ihl:4; +#else +#error "Please fix <asm/byteorder.h>" +#endif + __u8 tos; + __be16 tot_len; + __be16 id; + __be16 frag_off; + __u8 ttl; + __u8 protocol; + __sum16 check; + __be32 saddr; + __be32 daddr; + + /* + * The options start here. + */ +} SFE_IPV4_UNALIGNED_STRUCT; + +/* + * A UDP header, but with an optional "packed" attribute to + * help with performance on some platforms (see the definition of + * SFE_IPV4_UNALIGNED_STRUCT) + */ +struct sfe_ipv4_udp_hdr { + __be16 source; + __be16 dest; + __be16 len; + __sum16 check; +} SFE_IPV4_UNALIGNED_STRUCT; + +/* + * A TCP header, but with an optional "packed" attribute to + * help with performance on some platforms (see the definition of + * SFE_IPV4_UNALIGNED_STRUCT) + */ +struct sfe_ipv4_tcp_hdr { + __be16 source; + __be16 dest; + __be32 seq; + __be32 ack_seq; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u16 res1:4, + doff:4, + fin:1, + syn:1, + rst:1, + psh:1, + ack:1, + urg:1, + ece:1, + cwr:1; +#elif defined(__BIG_ENDIAN_BITFIELD) + __u16 doff:4, + res1:4, + cwr:1, + ece:1, + urg:1, + ack:1, + psh:1, + rst:1, + syn:1, + fin:1; +#else +#error "Adjust your defines" +#endif + __be16 window; + __sum16 check; + __be16 urg_ptr; +} SFE_IPV4_UNALIGNED_STRUCT; + +/* + * Specifies the lower bound on ACK numbers carried in the TCP header + */ +#define SFE_IPV4_TCP_MAX_ACK_WINDOW 65520 + +/* + * IPv4 TCP connection match additional data. 
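+ * + * These fields mirror netfilter's TCP window tracking. As a sketch of the + * checks made later in sfe_ipv4_recv_tcp(), a packet stays on the fast + * path only while its sequence space lies inside the window, e.g. for the + * right-hand edge (a wrap-safe signed compare): + * + *	(s32)(seq - (max_end + 1)) <= 0 + * + * with matching left-edge and ACK checks made against the + * counter-direction state. 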
*/ +struct sfe_ipv4_tcp_connection_match { + u8 win_scale; /* Window scale */ + u32 max_win; /* Maximum window size seen */ + u32 end; /* Sequence number of the next byte to send (seq + segment length) */ + u32 max_end; /* Sequence number of the last byte to ack */ +}; + +/* + * Bit flags for IPv4 connection matching entry. + */ +#define SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_SRC (1<<0) + /* Perform source translation */ +#define SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_DEST (1<<1) + /* Perform destination translation */ +#define SFE_IPV4_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK (1<<2) + /* Ignore TCP sequence numbers */ +#define SFE_IPV4_CONNECTION_MATCH_FLAG_WRITE_FAST_ETH_HDR (1<<3) + /* Fast Ethernet header write */ +#define SFE_IPV4_CONNECTION_MATCH_FLAG_WRITE_L2_HDR (1<<4) + /* Write an L2 header */ +#define SFE_IPV4_CONNECTION_MATCH_FLAG_PRIORITY_REMARK (1<<5) + /* remark priority of SKB */ +#define SFE_IPV4_CONNECTION_MATCH_FLAG_DSCP_REMARK (1<<6) + /* remark DSCP of packet */ + +/* + * IPv4 connection matching structure. + */ +struct sfe_ipv4_connection_match { + /* + * References to other objects. + */ + struct sfe_ipv4_connection_match *next; + struct sfe_ipv4_connection_match *prev; + struct sfe_ipv4_connection *connection; + struct sfe_ipv4_connection_match *counter_match; + /* Matches the flow in the opposite direction to the one in *connection */ + struct sfe_ipv4_connection_match *active_next; + struct sfe_ipv4_connection_match *active_prev; + bool active; /* Flag to indicate if we're on the active list */ + + /* + * Characteristics that identify flows that match this rule. + */ + struct net_device *match_dev; /* Network device */ + u8 match_protocol; /* Protocol */ + __be32 match_src_ip; /* Source IP address */ + __be32 match_dest_ip; /* Destination IP address */ + __be16 match_src_port; /* Source port/connection ident */ + __be16 match_dest_port; /* Destination port/connection ident */ + + /* + * Control the operations of the match. + */ + u32 flags; /* Bit flags */ +#ifdef CONFIG_NF_FLOW_COOKIE + u32 flow_cookie; /* used flow cookie, for debug */ +#endif +#ifdef CONFIG_XFRM + u32 flow_accel; /* The flow accelerated or not */ +#endif + + /* + * Connection state that we track once we match. + */ + union { /* Protocol-specific state */ + struct sfe_ipv4_tcp_connection_match tcp; + } protocol_state; + /* + * Stats recorded in a sync period. These stats will be added to + * rx_packet_count64/rx_byte_count64 after a sync period. + */ + u32 rx_packet_count; + u32 rx_byte_count; + + /* + * Packet translation information. + */ + __be32 xlate_src_ip; /* Address after source translation */ + __be16 xlate_src_port; /* Port/connection ident after source translation */ + u16 xlate_src_csum_adjustment; + /* Transport layer checksum adjustment after source translation */ + u16 xlate_src_partial_csum_adjustment; + /* Transport layer pseudo header checksum adjustment after source translation */ + + __be32 xlate_dest_ip; /* Address after destination translation */ + __be16 xlate_dest_port; /* Port/connection ident after destination translation */ + u16 xlate_dest_csum_adjustment; + /* Transport layer checksum adjustment after destination translation */ + u16 xlate_dest_partial_csum_adjustment; + /* Transport layer pseudo header checksum adjustment after destination translation */ + + /* + * QoS information + */ + u32 priority; + u32 dscp; + + /* + * Packet transmit information. 
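+ * + * The MAC addresses below are stored as arrays of three u16s so that the + * fast transmit path can write the Ethernet header with 16-bit stores + * (see the WRITE_FAST_ETH_HDR case in sfe_ipv4_recv_udp()). 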
+ */ + struct net_device *xmit_dev; /* Network device on which to transmit */ + unsigned short int xmit_dev_mtu; + /* Interface MTU */ + u16 xmit_dest_mac[ETH_ALEN / 2]; + /* Destination MAC address to use when forwarding */ + u16 xmit_src_mac[ETH_ALEN / 2]; + /* Source MAC address to use when forwarding */ + + /* + * Summary stats. + */ + u64 rx_packet_count64; + u64 rx_byte_count64; +}; + +/* + * Per-connection data structure. + */ +struct sfe_ipv4_connection { + struct sfe_ipv4_connection *next; + /* Pointer to the next entry in a hash chain */ + struct sfe_ipv4_connection *prev; + /* Pointer to the previous entry in a hash chain */ + int protocol; /* IP protocol number */ + __be32 src_ip; /* Src IP addr pre-translation */ + __be32 src_ip_xlate; /* Src IP addr post-translation */ + __be32 dest_ip; /* Dest IP addr pre-translation */ + __be32 dest_ip_xlate; /* Dest IP addr post-translation */ + __be16 src_port; /* Src port pre-translation */ + __be16 src_port_xlate; /* Src port post-translation */ + __be16 dest_port; /* Dest port pre-translation */ + __be16 dest_port_xlate; /* Dest port post-translation */ + struct sfe_ipv4_connection_match *original_match; + /* Original direction matching structure */ + struct net_device *original_dev; + /* Original direction source device */ + struct sfe_ipv4_connection_match *reply_match; + /* Reply direction matching structure */ + struct net_device *reply_dev; /* Reply direction source device */ + u64 last_sync_jiffies; /* Jiffies count for the last sync */ + struct sfe_ipv4_connection *all_connections_next; + /* Pointer to the next entry in the list of all connections */ + struct sfe_ipv4_connection *all_connections_prev; + /* Pointer to the previous entry in the list of all connections */ + u32 mark; /* mark for outgoing packet */ + u32 debug_read_seq; /* sequence number for debug dump */ +}; + +/* + * IPv4 connections and hash table size information. 
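+ * + * With a hash shift of 12 the tables below have 4096 buckets; a 32-bit + * hash is folded down to a bucket index as, for example: + * + *	idx = ((hash >> SFE_IPV4_CONNECTION_HASH_SHIFT) ^ hash) + *		& SFE_IPV4_CONNECTION_HASH_MASK; 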
+ */ +#define SFE_IPV4_CONNECTION_HASH_SHIFT 12 +#define SFE_IPV4_CONNECTION_HASH_SIZE (1 << SFE_IPV4_CONNECTION_HASH_SHIFT) +#define SFE_IPV4_CONNECTION_HASH_MASK (SFE_IPV4_CONNECTION_HASH_SIZE - 1) + +#ifdef CONFIG_NF_FLOW_COOKIE +#define SFE_FLOW_COOKIE_SIZE 2048 +#define SFE_FLOW_COOKIE_MASK 0x7ff + +struct sfe_flow_cookie_entry { + struct sfe_ipv4_connection_match *match; + unsigned long last_clean_time; +}; +#endif + +enum sfe_ipv4_exception_events { + SFE_IPV4_EXCEPTION_EVENT_UDP_HEADER_INCOMPLETE, + SFE_IPV4_EXCEPTION_EVENT_UDP_NO_CONNECTION, + SFE_IPV4_EXCEPTION_EVENT_UDP_IP_OPTIONS_OR_INITIAL_FRAGMENT, + SFE_IPV4_EXCEPTION_EVENT_UDP_SMALL_TTL, + SFE_IPV4_EXCEPTION_EVENT_UDP_NEEDS_FRAGMENTATION, + SFE_IPV4_EXCEPTION_EVENT_TCP_HEADER_INCOMPLETE, + SFE_IPV4_EXCEPTION_EVENT_TCP_NO_CONNECTION_SLOW_FLAGS, + SFE_IPV4_EXCEPTION_EVENT_TCP_NO_CONNECTION_FAST_FLAGS, + SFE_IPV4_EXCEPTION_EVENT_TCP_IP_OPTIONS_OR_INITIAL_FRAGMENT, + SFE_IPV4_EXCEPTION_EVENT_TCP_SMALL_TTL, + SFE_IPV4_EXCEPTION_EVENT_TCP_NEEDS_FRAGMENTATION, + SFE_IPV4_EXCEPTION_EVENT_TCP_FLAGS, + SFE_IPV4_EXCEPTION_EVENT_TCP_SEQ_EXCEEDS_RIGHT_EDGE, + SFE_IPV4_EXCEPTION_EVENT_TCP_SMALL_DATA_OFFS, + SFE_IPV4_EXCEPTION_EVENT_TCP_BAD_SACK, + SFE_IPV4_EXCEPTION_EVENT_TCP_BIG_DATA_OFFS, + SFE_IPV4_EXCEPTION_EVENT_TCP_SEQ_BEFORE_LEFT_EDGE, + SFE_IPV4_EXCEPTION_EVENT_TCP_ACK_EXCEEDS_RIGHT_EDGE, + SFE_IPV4_EXCEPTION_EVENT_TCP_ACK_BEFORE_LEFT_EDGE, + SFE_IPV4_EXCEPTION_EVENT_ICMP_HEADER_INCOMPLETE, + SFE_IPV4_EXCEPTION_EVENT_ICMP_UNHANDLED_TYPE, + SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_HEADER_INCOMPLETE, + SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_NON_V4, + SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_IP_OPTIONS_INCOMPLETE, + SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_UDP_HEADER_INCOMPLETE, + SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_TCP_HEADER_INCOMPLETE, + SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_UNHANDLED_PROTOCOL, + SFE_IPV4_EXCEPTION_EVENT_ICMP_NO_CONNECTION, + SFE_IPV4_EXCEPTION_EVENT_ICMP_FLUSHED_CONNECTION, + SFE_IPV4_EXCEPTION_EVENT_HEADER_INCOMPLETE, + SFE_IPV4_EXCEPTION_EVENT_BAD_TOTAL_LENGTH, + SFE_IPV4_EXCEPTION_EVENT_NON_V4, + SFE_IPV4_EXCEPTION_EVENT_NON_INITIAL_FRAGMENT, + SFE_IPV4_EXCEPTION_EVENT_DATAGRAM_INCOMPLETE, + SFE_IPV4_EXCEPTION_EVENT_IP_OPTIONS_INCOMPLETE, + SFE_IPV4_EXCEPTION_EVENT_UNHANDLED_PROTOCOL, + SFE_IPV4_EXCEPTION_EVENT_CLONED_SKB_UNSHARE_ERROR, + SFE_IPV4_EXCEPTION_EVENT_LAST +}; + +static char *sfe_ipv4_exception_events_string[SFE_IPV4_EXCEPTION_EVENT_LAST] = { + "UDP_HEADER_INCOMPLETE", + "UDP_NO_CONNECTION", + "UDP_IP_OPTIONS_OR_INITIAL_FRAGMENT", + "UDP_SMALL_TTL", + "UDP_NEEDS_FRAGMENTATION", + "TCP_HEADER_INCOMPLETE", + "TCP_NO_CONNECTION_SLOW_FLAGS", + "TCP_NO_CONNECTION_FAST_FLAGS", + "TCP_IP_OPTIONS_OR_INITIAL_FRAGMENT", + "TCP_SMALL_TTL", + "TCP_NEEDS_FRAGMENTATION", + "TCP_FLAGS", + "TCP_SEQ_EXCEEDS_RIGHT_EDGE", + "TCP_SMALL_DATA_OFFS", + "TCP_BAD_SACK", + "TCP_BIG_DATA_OFFS", + "TCP_SEQ_BEFORE_LEFT_EDGE", + "TCP_ACK_EXCEEDS_RIGHT_EDGE", + "TCP_ACK_BEFORE_LEFT_EDGE", + "ICMP_HEADER_INCOMPLETE", + "ICMP_UNHANDLED_TYPE", + "ICMP_IPV4_HEADER_INCOMPLETE", + "ICMP_IPV4_NON_V4", + "ICMP_IPV4_IP_OPTIONS_INCOMPLETE", + "ICMP_IPV4_UDP_HEADER_INCOMPLETE", + "ICMP_IPV4_TCP_HEADER_INCOMPLETE", + "ICMP_IPV4_UNHANDLED_PROTOCOL", + "ICMP_NO_CONNECTION", + "ICMP_FLUSHED_CONNECTION", + "HEADER_INCOMPLETE", + "BAD_TOTAL_LENGTH", + "NON_V4", + "NON_INITIAL_FRAGMENT", + "DATAGRAM_INCOMPLETE", + "IP_OPTIONS_INCOMPLETE", + "UNHANDLED_PROTOCOL", + "CLONED_SKB_UNSHARE_ERROR" +}; + +/* + * Per-module structure. 
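+ * + * Note that the counters below come in pairs: a u32 delta that is bumped + * on the hot path under the lock, and a u64 running total that the delta + * is folded into by sfe_ipv4_update_summary_stats(). 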
+ */ +struct sfe_ipv4 { + spinlock_t lock; /* Lock for SMP correctness */ + struct sfe_ipv4_connection_match *active_head; + /* Head of the list of recently active connections */ + struct sfe_ipv4_connection_match *active_tail; + /* Tail of the list of recently active connections */ + struct sfe_ipv4_connection *all_connections_head; + /* Head of the list of all connections */ + struct sfe_ipv4_connection *all_connections_tail; + /* Tail of the list of all connections */ + unsigned int num_connections; /* Number of connections */ + struct timer_list timer; /* Timer used for periodic sync ops */ + sfe_sync_rule_callback_t __rcu sync_rule_callback; + /* Callback function registered by a connection manager for stats syncing */ + struct sfe_ipv4_connection *conn_hash[SFE_IPV4_CONNECTION_HASH_SIZE]; + /* Connection hash table */ + struct sfe_ipv4_connection_match *conn_match_hash[SFE_IPV4_CONNECTION_HASH_SIZE]; + /* Connection match hash table */ +#ifdef CONFIG_NF_FLOW_COOKIE + struct sfe_flow_cookie_entry sfe_flow_cookie_table[SFE_FLOW_COOKIE_SIZE]; + /* flow cookie table*/ + flow_cookie_set_func_t flow_cookie_set_func; + /* function used to configure flow cookie in hardware*/ + int flow_cookie_enable; + /* Enable/disable flow cookie at runtime */ +#endif + + /* + * Stats recorded in a sync period. These stats will be added to + * connection_xxx64 after a sync period. + */ + u32 connection_create_requests; + /* Number of IPv4 connection create requests */ + u32 connection_create_collisions; + /* Number of IPv4 connection create requests that collided with existing hash table entries */ + u32 connection_destroy_requests; + /* Number of IPv4 connection destroy requests */ + u32 connection_destroy_misses; + /* Number of IPv4 connection destroy requests that missed our hash table */ + u32 connection_match_hash_hits; + /* Number of IPv4 connection match hash hits */ + u32 connection_match_hash_reorders; + /* Number of IPv4 connection match hash reorders */ + u32 connection_flushes; /* Number of IPv4 connection flushes */ + u32 packets_forwarded; /* Number of IPv4 packets forwarded */ + u32 packets_not_forwarded; /* Number of IPv4 packets not forwarded */ + u32 exception_events[SFE_IPV4_EXCEPTION_EVENT_LAST]; + + /* + * Summary statistics. + */ + u64 connection_create_requests64; + /* Number of IPv4 connection create requests */ + u64 connection_create_collisions64; + /* Number of IPv4 connection create requests that collided with existing hash table entries */ + u64 connection_destroy_requests64; + /* Number of IPv4 connection destroy requests */ + u64 connection_destroy_misses64; + /* Number of IPv4 connection destroy requests that missed our hash table */ + u64 connection_match_hash_hits64; + /* Number of IPv4 connection match hash hits */ + u64 connection_match_hash_reorders64; + /* Number of IPv4 connection match hash reorders */ + u64 connection_flushes64; /* Number of IPv4 connection flushes */ + u64 packets_forwarded64; /* Number of IPv4 packets forwarded */ + u64 packets_not_forwarded64; + /* Number of IPv4 packets not forwarded */ + u64 exception_events64[SFE_IPV4_EXCEPTION_EVENT_LAST]; + + /* + * Control state. + */ + struct kobject *sys_sfe_ipv4; /* sysfs linkage */ + int debug_dev; /* Major number of the debug char device */ + u32 debug_read_seq; /* sequence number for debug dump */ +}; + +/* + * Enumeration of the XML output. 
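+ * + * The states below are walked in document order; the dump produced + * through the debug char device is shaped roughly like (a sketch only; + * the exact attributes live in the writer methods): + * + *	<sfe_ipv4> + *		<connections> <connection .../> ... </connections> + *		<exceptions> <exception .../> ... </exceptions> + *		<stats .../> + *	</sfe_ipv4> 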
*/ +enum sfe_ipv4_debug_xml_states { + SFE_IPV4_DEBUG_XML_STATE_START, + SFE_IPV4_DEBUG_XML_STATE_CONNECTIONS_START, + SFE_IPV4_DEBUG_XML_STATE_CONNECTIONS_CONNECTION, + SFE_IPV4_DEBUG_XML_STATE_CONNECTIONS_END, + SFE_IPV4_DEBUG_XML_STATE_EXCEPTIONS_START, + SFE_IPV4_DEBUG_XML_STATE_EXCEPTIONS_EXCEPTION, + SFE_IPV4_DEBUG_XML_STATE_EXCEPTIONS_END, + SFE_IPV4_DEBUG_XML_STATE_STATS, + SFE_IPV4_DEBUG_XML_STATE_END, + SFE_IPV4_DEBUG_XML_STATE_DONE +}; + +/* + * XML write state. + */ +struct sfe_ipv4_debug_xml_write_state { + enum sfe_ipv4_debug_xml_states state; + /* XML output file state machine state */ + int iter_exception; /* Next exception iterator */ +}; + +typedef bool (*sfe_ipv4_debug_xml_write_method_t)(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length, + int *total_read, struct sfe_ipv4_debug_xml_write_state *ws); + +static struct sfe_ipv4 __si; + +/* + * sfe_ipv4_gen_ip_csum() + * Generate the IP checksum for an IPv4 header. + * + * Note that this function assumes that we have only 20 bytes of IP header. + */ +static inline u16 sfe_ipv4_gen_ip_csum(struct sfe_ipv4_ip_hdr *iph) +{ + u32 sum; + u16 *i = (u16 *)iph; + + iph->check = 0; + + /* + * Generate the sum. + */ + sum = i[0] + i[1] + i[2] + i[3] + i[4] + i[5] + i[6] + i[7] + i[8] + i[9]; + + /* + * Fold it to ones-complement form. + */ + sum = (sum & 0xffff) + (sum >> 16); + sum = (sum & 0xffff) + (sum >> 16); + + return (u16)sum ^ 0xffff; +} + +/* + * sfe_ipv4_get_connection_match_hash() + * Generate the hash used in connection match lookups. + */ +static inline unsigned int sfe_ipv4_get_connection_match_hash(struct net_device *dev, u8 protocol, + __be32 src_ip, __be16 src_port, + __be32 dest_ip, __be16 dest_port) +{ + size_t dev_addr = (size_t)dev; + u32 hash = ((u32)dev_addr) ^ ntohl(src_ip ^ dest_ip) ^ protocol ^ ntohs(src_port ^ dest_port); + return ((hash >> SFE_IPV4_CONNECTION_HASH_SHIFT) ^ hash) & SFE_IPV4_CONNECTION_HASH_MASK; +} + +/* + * sfe_ipv4_find_sfe_ipv4_connection_match() + * Get the IPv4 flow match info that corresponds to a particular 5-tuple. + * + * On entry we must be holding the lock that protects the hash table. + */ +static struct sfe_ipv4_connection_match * +sfe_ipv4_find_sfe_ipv4_connection_match(struct sfe_ipv4 *si, struct net_device *dev, u8 protocol, + __be32 src_ip, __be16 src_port, + __be32 dest_ip, __be16 dest_port) +{ + struct sfe_ipv4_connection_match *cm; + struct sfe_ipv4_connection_match *head; + unsigned int conn_match_idx; + + conn_match_idx = sfe_ipv4_get_connection_match_hash(dev, protocol, src_ip, src_port, dest_ip, dest_port); + cm = si->conn_match_hash[conn_match_idx]; + + /* + * If we don't have anything in this chain then bail. + */ + if (unlikely(!cm)) { + return NULL; + } + + /* + * Hopefully the first entry is the one we want. + */ + if ((cm->match_src_port == src_port) + && (cm->match_dest_port == dest_port) + && (cm->match_src_ip == src_ip) + && (cm->match_dest_ip == dest_ip) + && (cm->match_protocol == protocol) + && (cm->match_dev == dev)) { + si->connection_match_hash_hits++; + return cm; + } + + /* + * Unfortunately we didn't find it at the head, so we search the chain and + * move the matching entry to the top of the hash chain. We presume that it + * will be reused again very quickly. 
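+ * + * (This is a move-to-front heuristic: busy flows migrate to the head of + * their chain, so in the steady state most lookups hit on the first + * compare and count as connection_match_hash_hits.) 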
+ */ + head = cm; + do { + cm = cm->next; + } while (cm && (cm->match_src_port != src_port + || cm->match_dest_port != dest_port + || cm->match_src_ip != src_ip + || cm->match_dest_ip != dest_ip + || cm->match_protocol != protocol + || cm->match_dev != dev)); + + /* + * Not found then we're done. + */ + if (unlikely(!cm)) { + return NULL; + } + + /* + * We found a match so move it. + */ + if (cm->next) { + cm->next->prev = cm->prev; + } + cm->prev->next = cm->next; + cm->prev = NULL; + cm->next = head; + head->prev = cm; + si->conn_match_hash[conn_match_idx] = cm; + si->connection_match_hash_reorders++; + + return cm; +} + +/* + * sfe_ipv4_connection_match_update_summary_stats() + * Update the summary stats for a connection match entry. + */ +static inline void sfe_ipv4_connection_match_update_summary_stats(struct sfe_ipv4_connection_match *cm) +{ + cm->rx_packet_count64 += cm->rx_packet_count; + cm->rx_packet_count = 0; + cm->rx_byte_count64 += cm->rx_byte_count; + cm->rx_byte_count = 0; +} + +/* + * sfe_ipv4_connection_match_compute_translations() + * Compute port and address translations for a connection match entry. + */ +static void sfe_ipv4_connection_match_compute_translations(struct sfe_ipv4_connection_match *cm) +{ + /* + * Before we insert the entry look to see if this is tagged as doing address + * translations. If it is then work out the adjustment that we need to apply + * to the transport checksum. + */ + if (cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_SRC) { + /* + * Precompute an incremental checksum adjustment so we can + * edit packets in this stream very quickly. The algorithm is from RFC1624. + */ + u16 src_ip_hi = cm->match_src_ip >> 16; + u16 src_ip_lo = cm->match_src_ip & 0xffff; + u32 xlate_src_ip = ~cm->xlate_src_ip; + u16 xlate_src_ip_hi = xlate_src_ip >> 16; + u16 xlate_src_ip_lo = xlate_src_ip & 0xffff; + u16 xlate_src_port = ~cm->xlate_src_port; + u32 adj; + + /* + * When we compute this fold it down to a 16-bit offset + * as that way we can avoid having to do a double + * folding of the twos-complement result because the + * addition of 2 16-bit values cannot cause a double + * wrap-around! + */ + adj = src_ip_hi + src_ip_lo + cm->match_src_port + + xlate_src_ip_hi + xlate_src_ip_lo + xlate_src_port; + adj = (adj & 0xffff) + (adj >> 16); + adj = (adj & 0xffff) + (adj >> 16); + cm->xlate_src_csum_adjustment = (u16)adj; + + } + + if (cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_DEST) { + /* + * Precompute an incremental checksum adjustment so we can + * edit packets in this stream very quickly. The algorithm is from RFC1624. + */ + u16 dest_ip_hi = cm->match_dest_ip >> 16; + u16 dest_ip_lo = cm->match_dest_ip & 0xffff; + u32 xlate_dest_ip = ~cm->xlate_dest_ip; + u16 xlate_dest_ip_hi = xlate_dest_ip >> 16; + u16 xlate_dest_ip_lo = xlate_dest_ip & 0xffff; + u16 xlate_dest_port = ~cm->xlate_dest_port; + u32 adj; + + /* + * When we compute this fold it down to a 16-bit offset + * as that way we can avoid having to do a double + * folding of the twos-complement result because the + * addition of 2 16-bit values cannot cause a double + * wrap-around! 
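+ * + * Worked example with illustrative values: if the six 16-bit terms sum + * to 0x2c0a7, the first fold gives + * + *	(0x2c0a7 & 0xffff) + (0x2c0a7 >> 16) = 0xc0a7 + 0x2 = 0xc0a9 + * + * and the second fold simply absorbs any carry produced by the first. 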
*/ + adj = dest_ip_hi + dest_ip_lo + cm->match_dest_port + + xlate_dest_ip_hi + xlate_dest_ip_lo + xlate_dest_port; + adj = (adj & 0xffff) + (adj >> 16); + adj = (adj & 0xffff) + (adj >> 16); + cm->xlate_dest_csum_adjustment = (u16)adj; + } + + if (cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_SRC) { + u32 adj = ~cm->match_src_ip + cm->xlate_src_ip; + if (adj < cm->xlate_src_ip) { + adj++; + } + + adj = (adj & 0xffff) + (adj >> 16); + adj = (adj & 0xffff) + (adj >> 16); + cm->xlate_src_partial_csum_adjustment = (u16)adj; + } + + if (cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_DEST) { + u32 adj = ~cm->match_dest_ip + cm->xlate_dest_ip; + if (adj < cm->xlate_dest_ip) { + adj++; + } + + adj = (adj & 0xffff) + (adj >> 16); + adj = (adj & 0xffff) + (adj >> 16); + cm->xlate_dest_partial_csum_adjustment = (u16)adj; + } + +} + +/* + * sfe_ipv4_update_summary_stats() + * Update the summary stats. + */ +static void sfe_ipv4_update_summary_stats(struct sfe_ipv4 *si) +{ + int i; + + si->connection_create_requests64 += si->connection_create_requests; + si->connection_create_requests = 0; + si->connection_create_collisions64 += si->connection_create_collisions; + si->connection_create_collisions = 0; + si->connection_destroy_requests64 += si->connection_destroy_requests; + si->connection_destroy_requests = 0; + si->connection_destroy_misses64 += si->connection_destroy_misses; + si->connection_destroy_misses = 0; + si->connection_match_hash_hits64 += si->connection_match_hash_hits; + si->connection_match_hash_hits = 0; + si->connection_match_hash_reorders64 += si->connection_match_hash_reorders; + si->connection_match_hash_reorders = 0; + si->connection_flushes64 += si->connection_flushes; + si->connection_flushes = 0; + si->packets_forwarded64 += si->packets_forwarded; + si->packets_forwarded = 0; + si->packets_not_forwarded64 += si->packets_not_forwarded; + si->packets_not_forwarded = 0; + + for (i = 0; i < SFE_IPV4_EXCEPTION_EVENT_LAST; i++) { + si->exception_events64[i] += si->exception_events[i]; + si->exception_events[i] = 0; + } +} + +/* + * sfe_ipv4_insert_sfe_ipv4_connection_match() + * Insert a connection match into the hash. + * + * On entry we must be holding the lock that protects the hash table. + */ +static inline void sfe_ipv4_insert_sfe_ipv4_connection_match(struct sfe_ipv4 *si, + struct sfe_ipv4_connection_match *cm) +{ + struct sfe_ipv4_connection_match **hash_head; + struct sfe_ipv4_connection_match *prev_head; + unsigned int conn_match_idx + = sfe_ipv4_get_connection_match_hash(cm->match_dev, cm->match_protocol, + cm->match_src_ip, cm->match_src_port, + cm->match_dest_ip, cm->match_dest_port); + + hash_head = &si->conn_match_hash[conn_match_idx]; + prev_head = *hash_head; + cm->prev = NULL; + if (prev_head) { + prev_head->prev = cm; + } + + cm->next = prev_head; + *hash_head = cm; + +#ifdef CONFIG_NF_FLOW_COOKIE + if (!si->flow_cookie_enable) + return; + + /* + * Configure hardware to put a flow cookie in packets of this flow, + * so that we can accelerate the lookup process when we receive these packets. 
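+ * + * (On receive the cookie comes back in skb->flow_cookie and the lookup + * in sfe_ipv4_recv_udp()/sfe_ipv4_recv_tcp() becomes a direct index + * into sfe_flow_cookie_table instead of a hash-chain walk.) 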
*/ + for (conn_match_idx = 1; conn_match_idx < SFE_FLOW_COOKIE_SIZE; conn_match_idx++) { + struct sfe_flow_cookie_entry *entry = &si->sfe_flow_cookie_table[conn_match_idx]; + + if ((NULL == entry->match) && time_is_before_jiffies(entry->last_clean_time + HZ)) { + flow_cookie_set_func_t func; + + rcu_read_lock(); + func = rcu_dereference(si->flow_cookie_set_func); + if (func) { + if (!func(cm->match_protocol, cm->match_src_ip, cm->match_src_port, + cm->match_dest_ip, cm->match_dest_port, conn_match_idx)) { + entry->match = cm; + cm->flow_cookie = conn_match_idx; + } + } + rcu_read_unlock(); + + break; + } + } +#endif +} + +/* + * sfe_ipv4_remove_sfe_ipv4_connection_match() + * Remove a connection match object from the hash. + * + * On entry we must be holding the lock that protects the hash table. + */ +static inline void sfe_ipv4_remove_sfe_ipv4_connection_match(struct sfe_ipv4 *si, struct sfe_ipv4_connection_match *cm) +{ +#ifdef CONFIG_NF_FLOW_COOKIE + if (si->flow_cookie_enable) { + /* + * Tell hardware that we no longer need a flow cookie in packets of this flow + */ + unsigned int conn_match_idx; + + for (conn_match_idx = 1; conn_match_idx < SFE_FLOW_COOKIE_SIZE; conn_match_idx++) { + struct sfe_flow_cookie_entry *entry = &si->sfe_flow_cookie_table[conn_match_idx]; + + if (cm == entry->match) { + flow_cookie_set_func_t func; + + rcu_read_lock(); + func = rcu_dereference(si->flow_cookie_set_func); + if (func) { + func(cm->match_protocol, cm->match_src_ip, cm->match_src_port, + cm->match_dest_ip, cm->match_dest_port, 0); + } + rcu_read_unlock(); + + cm->flow_cookie = 0; + entry->match = NULL; + entry->last_clean_time = jiffies; + break; + } + } + } +#endif + + /* + * Unlink the connection match entry from the hash. + */ + if (cm->prev) { + cm->prev->next = cm->next; + } else { + unsigned int conn_match_idx + = sfe_ipv4_get_connection_match_hash(cm->match_dev, cm->match_protocol, + cm->match_src_ip, cm->match_src_port, + cm->match_dest_ip, cm->match_dest_port); + si->conn_match_hash[conn_match_idx] = cm->next; + } + + if (cm->next) { + cm->next->prev = cm->prev; + } + + /* + * If the connection match entry is in the active list remove it. + */ + if (cm->active) { + if (likely(cm->active_prev)) { + cm->active_prev->active_next = cm->active_next; + } else { + si->active_head = cm->active_next; + } + + if (likely(cm->active_next)) { + cm->active_next->active_prev = cm->active_prev; + } else { + si->active_tail = cm->active_prev; + } + } +} + +/* + * sfe_ipv4_get_connection_hash() + * Generate the hash used in connection lookups. + */ +static inline unsigned int sfe_ipv4_get_connection_hash(u8 protocol, __be32 src_ip, __be16 src_port, + __be32 dest_ip, __be16 dest_port) +{ + u32 hash = ntohl(src_ip ^ dest_ip) ^ protocol ^ ntohs(src_port ^ dest_port); + return ((hash >> SFE_IPV4_CONNECTION_HASH_SHIFT) ^ hash) & SFE_IPV4_CONNECTION_HASH_MASK; +} + +/* + * sfe_ipv4_find_sfe_ipv4_connection() + * Get the IPv4 connection info that corresponds to a particular 5-tuple. + * + * On entry we must be holding the lock that protects the hash table. + */ +static inline struct sfe_ipv4_connection *sfe_ipv4_find_sfe_ipv4_connection(struct sfe_ipv4 *si, u32 protocol, + __be32 src_ip, __be16 src_port, + __be32 dest_ip, __be16 dest_port) +{ + struct sfe_ipv4_connection *c; + unsigned int conn_idx = sfe_ipv4_get_connection_hash(protocol, src_ip, src_port, dest_ip, dest_port); + c = si->conn_hash[conn_idx]; + + /* + * If we don't have anything in this chain then bail. 
*/ + if (unlikely(!c)) { + return NULL; + } + + /* + * Hopefully the first entry is the one we want. + */ + if ((c->src_port == src_port) + && (c->dest_port == dest_port) + && (c->src_ip == src_ip) + && (c->dest_ip == dest_ip) + && (c->protocol == protocol)) { + return c; + } + + /* + * Unfortunately we didn't find it at the head, so we search the chain. + */ + do { + c = c->next; + } while (c && (c->src_port != src_port + || c->dest_port != dest_port + || c->src_ip != src_ip + || c->dest_ip != dest_ip + || c->protocol != protocol)); + + /* + * We will need the connection entry for subsequent create/destroy metadata, + * so there is no need to re-order the entry for these requests. + */ + return c; +} + +/* + * sfe_ipv4_mark_rule() + * Updates the mark for a currently offloaded connection + * + * Will take hash lock upon entry + */ +void sfe_ipv4_mark_rule(struct sfe_connection_mark *mark) +{ + struct sfe_ipv4 *si = &__si; + struct sfe_ipv4_connection *c; + + spin_lock_bh(&si->lock); + c = sfe_ipv4_find_sfe_ipv4_connection(si, mark->protocol, + mark->src_ip.ip, mark->src_port, + mark->dest_ip.ip, mark->dest_port); + if (c) { + WARN_ON((0 != c->mark) && (0 == mark->mark)); + c->mark = mark->mark; + } + spin_unlock_bh(&si->lock); + + if (c) { + DEBUG_TRACE("Matching connection found for mark, " + "setting from %08x to %08x\n", + c->mark, mark->mark); + } +} + +/* + * sfe_ipv4_insert_sfe_ipv4_connection() + * Insert a connection into the hash. + * + * On entry we must be holding the lock that protects the hash table. + */ +static void sfe_ipv4_insert_sfe_ipv4_connection(struct sfe_ipv4 *si, struct sfe_ipv4_connection *c) +{ + struct sfe_ipv4_connection **hash_head; + struct sfe_ipv4_connection *prev_head; + unsigned int conn_idx; + + /* + * Insert entry into the connection hash. + */ + conn_idx = sfe_ipv4_get_connection_hash(c->protocol, c->src_ip, c->src_port, + c->dest_ip, c->dest_port); + hash_head = &si->conn_hash[conn_idx]; + prev_head = *hash_head; + c->prev = NULL; + if (prev_head) { + prev_head->prev = c; + } + + c->next = prev_head; + *hash_head = c; + + /* + * Insert entry into the "all connections" list. + */ + if (si->all_connections_tail) { + c->all_connections_prev = si->all_connections_tail; + si->all_connections_tail->all_connections_next = c; + } else { + c->all_connections_prev = NULL; + si->all_connections_head = c; + } + + si->all_connections_tail = c; + c->all_connections_next = NULL; + si->num_connections++; + + /* + * Insert the connection match objects too. + */ + sfe_ipv4_insert_sfe_ipv4_connection_match(si, c->original_match); + sfe_ipv4_insert_sfe_ipv4_connection_match(si, c->reply_match); +} + +/* + * sfe_ipv4_remove_sfe_ipv4_connection() + * Remove an sfe_ipv4_connection object from the hash. + * + * On entry we must be holding the lock that protects the hash table. + */ +static void sfe_ipv4_remove_sfe_ipv4_connection(struct sfe_ipv4 *si, struct sfe_ipv4_connection *c) +{ + /* + * Remove the connection match objects. + */ + sfe_ipv4_remove_sfe_ipv4_connection_match(si, c->reply_match); + sfe_ipv4_remove_sfe_ipv4_connection_match(si, c->original_match); + + /* + * Unlink the connection. 
*/ + if (c->prev) { + c->prev->next = c->next; + } else { + unsigned int conn_idx = sfe_ipv4_get_connection_hash(c->protocol, c->src_ip, c->src_port, + c->dest_ip, c->dest_port); + si->conn_hash[conn_idx] = c->next; + } + + if (c->next) { + c->next->prev = c->prev; + } + + /* + * Unlink connection from all_connections list + */ + if (c->all_connections_prev) { + c->all_connections_prev->all_connections_next = c->all_connections_next; + } else { + si->all_connections_head = c->all_connections_next; + } + + if (c->all_connections_next) { + c->all_connections_next->all_connections_prev = c->all_connections_prev; + } else { + si->all_connections_tail = c->all_connections_prev; + } + + si->num_connections--; +} + +/* + * sfe_ipv4_gen_sync_sfe_ipv4_connection() + * Generate a sync message for a connection. + * + * On entry to this function we expect that the lock for the connection is either + * already held or isn't required. + */ +static void sfe_ipv4_gen_sync_sfe_ipv4_connection(struct sfe_ipv4 *si, struct sfe_ipv4_connection *c, + struct sfe_connection_sync *sis, sfe_sync_reason_t reason, + u64 now_jiffies) +{ + struct sfe_ipv4_connection_match *original_cm; + struct sfe_ipv4_connection_match *reply_cm; + + /* + * Fill in the update message. + */ + sis->is_v6 = 0; + sis->protocol = c->protocol; + sis->src_ip.ip = c->src_ip; + sis->src_ip_xlate.ip = c->src_ip_xlate; + sis->dest_ip.ip = c->dest_ip; + sis->dest_ip_xlate.ip = c->dest_ip_xlate; + sis->src_port = c->src_port; + sis->src_port_xlate = c->src_port_xlate; + sis->dest_port = c->dest_port; + sis->dest_port_xlate = c->dest_port_xlate; + + original_cm = c->original_match; + reply_cm = c->reply_match; + sis->src_td_max_window = original_cm->protocol_state.tcp.max_win; + sis->src_td_end = original_cm->protocol_state.tcp.end; + sis->src_td_max_end = original_cm->protocol_state.tcp.max_end; + sis->dest_td_max_window = reply_cm->protocol_state.tcp.max_win; + sis->dest_td_end = reply_cm->protocol_state.tcp.end; + sis->dest_td_max_end = reply_cm->protocol_state.tcp.max_end; + + sis->src_new_packet_count = original_cm->rx_packet_count; + sis->src_new_byte_count = original_cm->rx_byte_count; + sis->dest_new_packet_count = reply_cm->rx_packet_count; + sis->dest_new_byte_count = reply_cm->rx_byte_count; + + sfe_ipv4_connection_match_update_summary_stats(original_cm); + sfe_ipv4_connection_match_update_summary_stats(reply_cm); + + sis->src_dev = original_cm->match_dev; + sis->src_packet_count = original_cm->rx_packet_count64; + sis->src_byte_count = original_cm->rx_byte_count64; + + sis->dest_dev = reply_cm->match_dev; + sis->dest_packet_count = reply_cm->rx_packet_count64; + sis->dest_byte_count = reply_cm->rx_byte_count64; + + sis->reason = reason; + + /* + * Get the time increment since our last sync. + */ + sis->delta_jiffies = now_jiffies - c->last_sync_jiffies; + c->last_sync_jiffies = now_jiffies; +} + +/* + * sfe_ipv4_flush_sfe_ipv4_connection() + * Flush a connection and free all associated resources. + * + * We need to be called with bottom halves disabled locally as we need to acquire + * the connection hash lock and release it again. In general we're actually called + * from within a BH and so we're fine, but we're also called when connections are + * torn down. 
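+ * + * A caller that is not already in BH context would bracket the call, + * e.g. (illustrative only): + * + *	local_bh_disable(); + *	sfe_ipv4_flush_sfe_ipv4_connection(si, c, SFE_SYNC_REASON_DESTROY); + *	local_bh_enable(); 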
*/ +static void sfe_ipv4_flush_sfe_ipv4_connection(struct sfe_ipv4 *si, + struct sfe_ipv4_connection *c, + sfe_sync_reason_t reason) +{ + struct sfe_connection_sync sis; + u64 now_jiffies; + sfe_sync_rule_callback_t sync_rule_callback; + + rcu_read_lock(); + spin_lock_bh(&si->lock); + si->connection_flushes++; + sync_rule_callback = rcu_dereference(si->sync_rule_callback); + spin_unlock_bh(&si->lock); + + if (sync_rule_callback) { + /* + * Generate a sync message and then sync. + */ + now_jiffies = get_jiffies_64(); + sfe_ipv4_gen_sync_sfe_ipv4_connection(si, c, &sis, reason, now_jiffies); + sync_rule_callback(&sis); + } + + rcu_read_unlock(); + + /* + * Release our hold of the source and dest devices and free the memory + * for our connection objects. + */ + dev_put(c->original_dev); + dev_put(c->reply_dev); + kfree(c->original_match); + kfree(c->reply_match); + kfree(c); +} + +/* + * sfe_ipv4_recv_udp() + * Handle UDP packet receives and forwarding. + */ +static int sfe_ipv4_recv_udp(struct sfe_ipv4 *si, struct sk_buff *skb, struct net_device *dev, + unsigned int len, struct sfe_ipv4_ip_hdr *iph, unsigned int ihl, bool flush_on_find) +{ + struct sfe_ipv4_udp_hdr *udph; + __be32 src_ip; + __be32 dest_ip; + __be16 src_port; + __be16 dest_port; + struct sfe_ipv4_connection_match *cm; + u8 ttl; + struct net_device *xmit_dev; + + /* + * Is our packet too short to contain a valid UDP header? + */ + if (unlikely(!pskb_may_pull(skb, (sizeof(struct sfe_ipv4_udp_hdr) + ihl)))) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_UDP_HEADER_INCOMPLETE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("packet too short for UDP header\n"); + return 0; + } + + /* + * Read the IP address and port information. Read the IP header data first + * because we've almost certainly got that in the cache. We may not yet have + * the UDP header cached though so allow more time for any prefetching. + */ + src_ip = iph->saddr; + dest_ip = iph->daddr; + + udph = (struct sfe_ipv4_udp_hdr *)(skb->data + ihl); + src_port = udph->source; + dest_port = udph->dest; + + spin_lock_bh(&si->lock); + + /* + * Look for a connection match. + */ +#ifdef CONFIG_NF_FLOW_COOKIE + cm = si->sfe_flow_cookie_table[skb->flow_cookie & SFE_FLOW_COOKIE_MASK].match; + if (unlikely(!cm)) { + cm = sfe_ipv4_find_sfe_ipv4_connection_match(si, dev, IPPROTO_UDP, src_ip, src_port, dest_ip, dest_port); + } +#else + cm = sfe_ipv4_find_sfe_ipv4_connection_match(si, dev, IPPROTO_UDP, src_ip, src_port, dest_ip, dest_port); +#endif + if (unlikely(!cm)) { + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_UDP_NO_CONNECTION]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("no connection found\n"); + return 0; + } + + /* + * If our packet has been marked as "flush on find" we can't actually + * forward it in the fast path, but now that we've found an associated + * connection we can flush that out before we process the packet. + */ + if (unlikely(flush_on_find)) { + struct sfe_ipv4_connection *c = cm->connection; + sfe_ipv4_remove_sfe_ipv4_connection(si, c); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_UDP_IP_OPTIONS_OR_INITIAL_FRAGMENT]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("flush on find\n"); + sfe_ipv4_flush_sfe_ipv4_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + +#ifdef CONFIG_XFRM + /* + * We can't accelerate the flow in this direction, just let it go + * through the slow path. 
+ */ + if (unlikely(!cm->flow_accel)) { + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + return 0; + } +#endif + + /* + * Does our TTL allow forwarding? + */ + ttl = iph->ttl; + if (unlikely(ttl < 2)) { + struct sfe_ipv4_connection *c = cm->connection; + sfe_ipv4_remove_sfe_ipv4_connection(si, c); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_UDP_SMALL_TTL]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("ttl too low\n"); + sfe_ipv4_flush_sfe_ipv4_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + /* + * If our packet is larger than the MTU of the transmit interface then + * we can't forward it easily. + */ + if (unlikely(len > cm->xmit_dev_mtu)) { + struct sfe_ipv4_connection *c = cm->connection; + sfe_ipv4_remove_sfe_ipv4_connection(si, c); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_UDP_NEEDS_FRAGMENTATION]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("larger than mtu\n"); + sfe_ipv4_flush_sfe_ipv4_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + /* + * From this point on we're good to modify the packet. + */ + + /* + * Check if skb was cloned. If it was, unshare it. Because + * the data area is going to be written in this path and we don't want to + * change the cloned skb's data section. + */ + if (unlikely(skb_cloned(skb))) { + DEBUG_TRACE("%px: skb is a cloned skb\n", skb); + skb = skb_unshare(skb, GFP_ATOMIC); + if (!skb) { + DEBUG_WARN("Failed to unshare the cloned skb\n"); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_CLONED_SKB_UNSHARE_ERROR]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + return 0; + } + + /* + * Update the iph and udph pointers with the unshared skb's data area. + */ + iph = (struct sfe_ipv4_ip_hdr *)skb->data; + udph = (struct sfe_ipv4_udp_hdr *)(skb->data + ihl); + } + + /* + * Update DSCP + */ + if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_DSCP_REMARK)) { + iph->tos = (iph->tos & SFE_IPV4_DSCP_MASK) | cm->dscp; + } + + /* + * Decrement our TTL. + */ + iph->ttl = ttl - 1; + + /* + * Do we have to perform translations of the source address/port? + */ + if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_SRC)) { + u16 udp_csum; + + iph->saddr = cm->xlate_src_ip; + udph->source = cm->xlate_src_port; + + /* + * Do we have a non-zero UDP checksum? If we do then we need + * to update it. + */ + udp_csum = udph->check; + if (likely(udp_csum)) { + u32 sum; + + if (unlikely(skb->ip_summed == CHECKSUM_PARTIAL)) { + sum = udp_csum + cm->xlate_src_partial_csum_adjustment; + } else { + sum = udp_csum + cm->xlate_src_csum_adjustment; + } + + sum = (sum & 0xffff) + (sum >> 16); + udph->check = (u16)sum; + } + } + + /* + * Do we have to perform translations of the destination address/port? + */ + if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_DEST)) { + u16 udp_csum; + + iph->daddr = cm->xlate_dest_ip; + udph->dest = cm->xlate_dest_port; + + /* + * Do we have a non-zero UDP checksum? If we do then we need + * to update it. + */ + udp_csum = udph->check; + if (likely(udp_csum)) { + u32 sum; + + if (unlikely(skb->ip_summed == CHECKSUM_PARTIAL)) { + sum = udp_csum + cm->xlate_dest_partial_csum_adjustment; + } else { + sum = udp_csum + cm->xlate_dest_csum_adjustment; + } + + sum = (sum & 0xffff) + (sum >> 16); + udph->check = (u16)sum; + } + } + + /* + * Replace the IP checksum. + */ + iph->check = sfe_ipv4_gen_ip_csum(iph); + + /* + * Update traffic stats. 
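+ * + * (These are the per-sync-period u32 deltas; they are folded into the + * 64-bit totals by sfe_ipv4_connection_match_update_summary_stats() + * when a sync message is generated.) 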
*/ + cm->rx_packet_count++; + cm->rx_byte_count += len; + + /* + * If we're not already on the active list then insert ourselves at the tail + * of the current list. + */ + if (unlikely(!cm->active)) { + cm->active = true; + cm->active_prev = si->active_tail; + if (likely(si->active_tail)) { + si->active_tail->active_next = cm; + } else { + si->active_head = cm; + } + si->active_tail = cm; + } + + xmit_dev = cm->xmit_dev; + skb->dev = xmit_dev; + + /* + * Check to see if we need to write a header. + */ + if (likely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_WRITE_L2_HDR)) { + if (unlikely(!(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_WRITE_FAST_ETH_HDR))) { + dev_hard_header(skb, xmit_dev, ETH_P_IP, + cm->xmit_dest_mac, cm->xmit_src_mac, len); + } else { + /* + * For the simple case we write this really fast. + */ + struct sfe_ipv4_eth_hdr *eth = (struct sfe_ipv4_eth_hdr *)__skb_push(skb, ETH_HLEN); + eth->h_proto = htons(ETH_P_IP); + eth->h_dest[0] = cm->xmit_dest_mac[0]; + eth->h_dest[1] = cm->xmit_dest_mac[1]; + eth->h_dest[2] = cm->xmit_dest_mac[2]; + eth->h_source[0] = cm->xmit_src_mac[0]; + eth->h_source[1] = cm->xmit_src_mac[1]; + eth->h_source[2] = cm->xmit_src_mac[2]; + } + } + + /* + * Update priority of skb. + */ + if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_PRIORITY_REMARK)) { + skb->priority = cm->priority; + } + + /* + * Mark outgoing packet. + */ + skb->mark = cm->connection->mark; + if (skb->mark) { + DEBUG_TRACE("SKB MARK is NON ZERO %x\n", skb->mark); + } + + si->packets_forwarded++; + spin_unlock_bh(&si->lock); + + /* + * We're going to check for GSO flags when we transmit the packet so + * start fetching the necessary cache line now. + */ + prefetch(skb_shinfo(skb)); + + /* + * Mark that this packet has been fast forwarded. + */ + skb->fast_forwarded = 1; + + /* + * Send the packet on its way. + */ + dev_queue_xmit(skb); + + return 1; +} + +/* + * sfe_ipv4_process_tcp_option_sack() + * Parse the TCP SACK option and update the ack accordingly + */ +static bool sfe_ipv4_process_tcp_option_sack(const struct sfe_ipv4_tcp_hdr *th, const u32 data_offs, + u32 *ack) +{ + u32 length = sizeof(struct sfe_ipv4_tcp_hdr); + u8 *ptr = (u8 *)th + length; + + /* + * Ignore processing if the TCP packet has only the TIMESTAMP option. + */ + if (likely(data_offs == length + TCPOLEN_TIMESTAMP + 1 + 1) + && likely(ptr[0] == TCPOPT_NOP) + && likely(ptr[1] == TCPOPT_NOP) + && likely(ptr[2] == TCPOPT_TIMESTAMP) + && likely(ptr[3] == TCPOLEN_TIMESTAMP)) { + return true; + } + + /* + * TCP options. Parse SACK option. 
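+ * + * A SACK block list is kind (5), length, then 8-byte left/right edge + * pairs; a single-block option is therefore 10 bytes: + * + *	05 0a <4-byte left edge> <4-byte right edge> + * + * The loop below keeps the highest right edge seen and lets it + * override the plain ACK number. 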
*/ + while (length < data_offs) { + u8 size; + u8 kind; + + ptr = (u8 *)th + length; + kind = *ptr; + + /* + * NOP, used for padding. + * Handled before the other options so we can skip it quickly + * without reading a size byte. + */ + if (kind == TCPOPT_NOP) { + length++; + continue; + } + + if (kind == TCPOPT_SACK) { + u32 sack = 0; + u8 re = 1 + 1; + + size = *(ptr + 1); + if ((size < (1 + 1 + TCPOLEN_SACK_PERBLOCK)) + || ((size - (1 + 1)) % (TCPOLEN_SACK_PERBLOCK)) + || (size > (data_offs - length))) { + return false; + } + + re += 4; + while (re < size) { + u32 sack_re; + u8 *sptr = ptr + re; + sack_re = (sptr[0] << 24) | (sptr[1] << 16) | (sptr[2] << 8) | sptr[3]; + if (sack_re > sack) { + sack = sack_re; + } + re += TCPOLEN_SACK_PERBLOCK; + } + if (sack > *ack) { + *ack = sack; + } + length += size; + continue; + } + if (kind == TCPOPT_EOL) { + return true; + } + size = *(ptr + 1); + if (size < 2) { + return false; + } + length += size; + } + + return true; +} + +/* + * sfe_ipv4_recv_tcp() + * Handle TCP packet receives and forwarding. + */ +static int sfe_ipv4_recv_tcp(struct sfe_ipv4 *si, struct sk_buff *skb, struct net_device *dev, + unsigned int len, struct sfe_ipv4_ip_hdr *iph, unsigned int ihl, bool flush_on_find) +{ + struct sfe_ipv4_tcp_hdr *tcph; + __be32 src_ip; + __be32 dest_ip; + __be16 src_port; + __be16 dest_port; + struct sfe_ipv4_connection_match *cm; + struct sfe_ipv4_connection_match *counter_cm; + u8 ttl; + u32 flags; + struct net_device *xmit_dev; + + /* + * Is our packet too short to contain a valid TCP header? + */ + if (unlikely(!pskb_may_pull(skb, (sizeof(struct sfe_ipv4_tcp_hdr) + ihl)))) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_HEADER_INCOMPLETE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("packet too short for TCP header\n"); + return 0; + } + + /* + * Read the IP address and port information. Read the IP header data first + * because we've almost certainly got that in the cache. We may not yet have + * the TCP header cached though so allow more time for any prefetching. + */ + src_ip = iph->saddr; + dest_ip = iph->daddr; + + tcph = (struct sfe_ipv4_tcp_hdr *)(skb->data + ihl); + src_port = tcph->source; + dest_port = tcph->dest; + flags = tcp_flag_word(tcph); + + spin_lock_bh(&si->lock); + + /* + * Look for a connection match. + */ +#ifdef CONFIG_NF_FLOW_COOKIE + cm = si->sfe_flow_cookie_table[skb->flow_cookie & SFE_FLOW_COOKIE_MASK].match; + if (unlikely(!cm)) { + cm = sfe_ipv4_find_sfe_ipv4_connection_match(si, dev, IPPROTO_TCP, src_ip, src_port, dest_ip, dest_port); + } +#else + cm = sfe_ipv4_find_sfe_ipv4_connection_match(si, dev, IPPROTO_TCP, src_ip, src_port, dest_ip, dest_port); +#endif + if (unlikely(!cm)) { + /* + * We didn't get a connection but as TCP is connection-oriented that + * may be because this is a non-fast connection (one that is not yet + * in the established state). + * For diagnostic purposes we differentiate this here. 
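+ * + * (A pure ACK, i.e. flags == TCP_FLAG_ACK once SYN/RST/FIN are masked + * in, counts as NO_CONNECTION_FAST_FLAGS; anything else, e.g. a SYN + * that simply has not been offloaded yet, counts as + * NO_CONNECTION_SLOW_FLAGS.) 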
+		 */
+		if (likely((flags & (TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_FIN | TCP_FLAG_ACK)) == TCP_FLAG_ACK)) {
+			si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_NO_CONNECTION_FAST_FLAGS]++;
+			si->packets_not_forwarded++;
+			spin_unlock_bh(&si->lock);
+
+			DEBUG_TRACE("no connection found - fast flags\n");
+			return 0;
+		}
+		si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_NO_CONNECTION_SLOW_FLAGS]++;
+		si->packets_not_forwarded++;
+		spin_unlock_bh(&si->lock);
+
+		DEBUG_TRACE("no connection found - slow flags: 0x%x\n",
+			    flags & (TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_FIN | TCP_FLAG_ACK));
+		return 0;
+	}
+
+	/*
+	 * If our packet has been marked as "flush on find" we can't actually
+	 * forward it in the fast path, but now that we've found an associated
+	 * connection we can flush that out before we process the packet.
+	 */
+	if (unlikely(flush_on_find)) {
+		struct sfe_ipv4_connection *c = cm->connection;
+		sfe_ipv4_remove_sfe_ipv4_connection(si, c);
+		si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_IP_OPTIONS_OR_INITIAL_FRAGMENT]++;
+		si->packets_not_forwarded++;
+		spin_unlock_bh(&si->lock);
+
+		DEBUG_TRACE("flush on find\n");
+		sfe_ipv4_flush_sfe_ipv4_connection(si, c, SFE_SYNC_REASON_FLUSH);
+		return 0;
+	}
+
+#ifdef CONFIG_XFRM
+	/*
+	 * We can't accelerate the flow in this direction, just let it go
+	 * through the slow path.
+	 */
+	if (unlikely(!cm->flow_accel)) {
+		si->packets_not_forwarded++;
+		spin_unlock_bh(&si->lock);
+		return 0;
+	}
+#endif
+	/*
+	 * Does our TTL allow forwarding?
+	 */
+	ttl = iph->ttl;
+	if (unlikely(ttl < 2)) {
+		struct sfe_ipv4_connection *c = cm->connection;
+		sfe_ipv4_remove_sfe_ipv4_connection(si, c);
+		si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_SMALL_TTL]++;
+		si->packets_not_forwarded++;
+		spin_unlock_bh(&si->lock);
+
+		DEBUG_TRACE("ttl too low\n");
+		sfe_ipv4_flush_sfe_ipv4_connection(si, c, SFE_SYNC_REASON_FLUSH);
+		return 0;
+	}
+
+	/*
+	 * If our packet is larger than the MTU of the transmit interface then
+	 * we can't forward it easily.
+	 */
+	if (unlikely((len > cm->xmit_dev_mtu) && !skb_is_gso(skb))) {
+		struct sfe_ipv4_connection *c = cm->connection;
+		sfe_ipv4_remove_sfe_ipv4_connection(si, c);
+		si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_NEEDS_FRAGMENTATION]++;
+		si->packets_not_forwarded++;
+		spin_unlock_bh(&si->lock);
+
+		DEBUG_TRACE("larger than mtu\n");
+		sfe_ipv4_flush_sfe_ipv4_connection(si, c, SFE_SYNC_REASON_FLUSH);
+		return 0;
+	}
+
+	/*
+	 * Look at our TCP flags. Anything missing an ACK or that has RST, SYN or FIN
+	 * set is not a fast path packet.
+	 */
+	if (unlikely((flags & (TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_FIN | TCP_FLAG_ACK)) != TCP_FLAG_ACK)) {
+		struct sfe_ipv4_connection *c = cm->connection;
+		sfe_ipv4_remove_sfe_ipv4_connection(si, c);
+		si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_FLAGS]++;
+		si->packets_not_forwarded++;
+		spin_unlock_bh(&si->lock);
+
+		DEBUG_TRACE("TCP flags: 0x%x are not fast\n",
+			    flags & (TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_FIN | TCP_FLAG_ACK));
+		sfe_ipv4_flush_sfe_ipv4_connection(si, c, SFE_SYNC_REASON_FLUSH);
+		return 0;
+	}
+
+	counter_cm = cm->counter_match;
+
+	/*
+	 * Are we doing sequence number checking?
+	 */
+	if (likely(!(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK))) {
+		u32 seq;
+		u32 ack;
+		u32 sack;
+		u32 data_offs;
+		u32 end;
+		u32 left_edge;
+		u32 scaled_win;
+		u32 max_end;
+
+		/*
+		 * Is our sequence fully past the right hand edge of the window?
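+		 * Sequence numbers are compared with the (s32)(a - b) idiom so
+		 * the comparison stays correct modulo 2^32: e.g. with
+		 * seq = 0x00000005 and max_end = 0xfffffff0 the subtraction
+		 * wraps to a small positive value, so seq is still treated as
+		 * being ahead of max_end.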
+ */ + seq = ntohl(tcph->seq); + if (unlikely((s32)(seq - (cm->protocol_state.tcp.max_end + 1)) > 0)) { + struct sfe_ipv4_connection *c = cm->connection; + sfe_ipv4_remove_sfe_ipv4_connection(si, c); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_SEQ_EXCEEDS_RIGHT_EDGE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("seq: %u exceeds right edge: %u\n", + seq, cm->protocol_state.tcp.max_end + 1); + sfe_ipv4_flush_sfe_ipv4_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + /* + * Check that our TCP data offset isn't too short. + */ + data_offs = tcph->doff << 2; + if (unlikely(data_offs < sizeof(struct sfe_ipv4_tcp_hdr))) { + struct sfe_ipv4_connection *c = cm->connection; + sfe_ipv4_remove_sfe_ipv4_connection(si, c); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_SMALL_DATA_OFFS]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("TCP data offset: %u, too small\n", data_offs); + sfe_ipv4_flush_sfe_ipv4_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + /* + * Update ACK according to any SACK option. + */ + ack = ntohl(tcph->ack_seq); + sack = ack; + if (unlikely(!sfe_ipv4_process_tcp_option_sack(tcph, data_offs, &sack))) { + struct sfe_ipv4_connection *c = cm->connection; + sfe_ipv4_remove_sfe_ipv4_connection(si, c); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_BAD_SACK]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("TCP option SACK size is wrong\n"); + sfe_ipv4_flush_sfe_ipv4_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + /* + * Check that our TCP data offset isn't past the end of the packet. + */ + data_offs += sizeof(struct sfe_ipv4_ip_hdr); + if (unlikely(len < data_offs)) { + struct sfe_ipv4_connection *c = cm->connection; + sfe_ipv4_remove_sfe_ipv4_connection(si, c); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_BIG_DATA_OFFS]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("TCP data offset: %u, past end of packet: %u\n", + data_offs, len); + sfe_ipv4_flush_sfe_ipv4_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + end = seq + len - data_offs; + + /* + * Is our sequence fully before the left hand edge of the window? + */ + if (unlikely((s32)(end - (cm->protocol_state.tcp.end + - counter_cm->protocol_state.tcp.max_win - 1)) < 0)) { + struct sfe_ipv4_connection *c = cm->connection; + sfe_ipv4_remove_sfe_ipv4_connection(si, c); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_SEQ_BEFORE_LEFT_EDGE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("seq: %u before left edge: %u\n", + end, cm->protocol_state.tcp.end - counter_cm->protocol_state.tcp.max_win - 1); + sfe_ipv4_flush_sfe_ipv4_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + /* + * Are we acking data that is to the right of what has been sent? + */ + if (unlikely((s32)(sack - (counter_cm->protocol_state.tcp.end + 1)) > 0)) { + struct sfe_ipv4_connection *c = cm->connection; + sfe_ipv4_remove_sfe_ipv4_connection(si, c); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_ACK_EXCEEDS_RIGHT_EDGE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("ack: %u exceeds right edge: %u\n", + sack, counter_cm->protocol_state.tcp.end + 1); + sfe_ipv4_flush_sfe_ipv4_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + /* + * Is our ack too far before the left hand edge of the window? 
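+		 * The left edge deliberately includes an extra
+		 * SFE_IPV4_TCP_MAX_ACK_WINDOW of slack beyond the peer's
+		 * maximum window, since ACKs for old data can legitimately
+		 * arrive long after the window has moved on.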
+		 */
+		left_edge = counter_cm->protocol_state.tcp.end
+			    - cm->protocol_state.tcp.max_win
+			    - SFE_IPV4_TCP_MAX_ACK_WINDOW
+			    - 1;
+		if (unlikely((s32)(sack - left_edge) < 0)) {
+			struct sfe_ipv4_connection *c = cm->connection;
+			sfe_ipv4_remove_sfe_ipv4_connection(si, c);
+			si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_ACK_BEFORE_LEFT_EDGE]++;
+			si->packets_not_forwarded++;
+			spin_unlock_bh(&si->lock);
+
+			DEBUG_TRACE("ack: %u before left edge: %u\n", sack, left_edge);
+			sfe_ipv4_flush_sfe_ipv4_connection(si, c, SFE_SYNC_REASON_FLUSH);
+			return 0;
+		}
+
+		/*
+		 * Have we just seen the largest window size yet for this connection? If yes
+		 * then we need to record the new value.
+		 */
+		scaled_win = ntohs(tcph->window) << cm->protocol_state.tcp.win_scale;
+		scaled_win += (sack - ack);
+		if (unlikely(cm->protocol_state.tcp.max_win < scaled_win)) {
+			cm->protocol_state.tcp.max_win = scaled_win;
+		}
+
+		/*
+		 * If our sequence and/or ack numbers have advanced then record the new state.
+		 */
+		if (likely((s32)(end - cm->protocol_state.tcp.end) >= 0)) {
+			cm->protocol_state.tcp.end = end;
+		}
+
+		max_end = sack + scaled_win;
+		if (likely((s32)(max_end - counter_cm->protocol_state.tcp.max_end) >= 0)) {
+			counter_cm->protocol_state.tcp.max_end = max_end;
+		}
+	}
+
+	/*
+	 * From this point on we're good to modify the packet.
+	 */
+
+	/*
+	 * Check if skb was cloned. If it was, unshare it. Because
+	 * the data area is going to be written in this path and we don't want to
+	 * change the cloned skb's data section.
+	 */
+	if (unlikely(skb_cloned(skb))) {
+		DEBUG_TRACE("%px: skb is a cloned skb\n", skb);
+		skb = skb_unshare(skb, GFP_ATOMIC);
+		if (!skb) {
+			DEBUG_WARN("Failed to unshare the cloned skb\n");
+			si->exception_events[SFE_IPV4_EXCEPTION_EVENT_CLONED_SKB_UNSHARE_ERROR]++;
+			si->packets_not_forwarded++;
+			spin_unlock_bh(&si->lock);
+			return 0;
+		}
+
+		/*
+		 * Update the iph and tcph pointers with the unshared skb's data area.
+		 */
+		iph = (struct sfe_ipv4_ip_hdr *)skb->data;
+		tcph = (struct sfe_ipv4_tcp_hdr *)(skb->data + ihl);
+	}
+
+	/*
+	 * Update DSCP
+	 */
+	if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_DSCP_REMARK)) {
+		iph->tos = (iph->tos & SFE_IPV4_DSCP_MASK) | cm->dscp;
+	}
+
+	/*
+	 * Decrement our TTL.
+	 */
+	iph->ttl = ttl - 1;
+
+	/*
+	 * Do we have to perform translations of the source address/port?
+	 */
+	if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_SRC)) {
+		u16 tcp_csum;
+		u32 sum;
+
+		iph->saddr = cm->xlate_src_ip;
+		tcph->source = cm->xlate_src_port;
+
+		/*
+		 * Do we have a non-zero TCP checksum? If we do then we need
+		 * to update it.
+		 */
+		tcp_csum = tcph->check;
+		if (unlikely(skb->ip_summed == CHECKSUM_PARTIAL)) {
+			sum = tcp_csum + cm->xlate_src_partial_csum_adjustment;
+		} else {
+			sum = tcp_csum + cm->xlate_src_csum_adjustment;
+		}
+
+		sum = (sum & 0xffff) + (sum >> 16);
+		tcph->check = (u16)sum;
+	}
+
+	/*
+	 * Do we have to perform translations of the destination address/port?
+	 */
+	if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_DEST)) {
+		u16 tcp_csum;
+		u32 sum;
+
+		iph->daddr = cm->xlate_dest_ip;
+		tcph->dest = cm->xlate_dest_port;
+
+		/*
+		 * Do we have a non-zero TCP checksum? If we do then we need
+		 * to update it.
+		 */
+		tcp_csum = tcph->check;
+		if (unlikely(skb->ip_summed == CHECKSUM_PARTIAL)) {
+			sum = tcp_csum + cm->xlate_dest_partial_csum_adjustment;
+		} else {
+			sum = tcp_csum + cm->xlate_dest_csum_adjustment;
+		}
+
+		sum = (sum & 0xffff) + (sum >> 16);
+		tcph->check = (u16)sum;
+	}
+
+	/*
+	 * Replace the IP checksum.
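+	 * (The transport checksum above was updated incrementally: the 16-bit
+	 * checksum and the precomputed one's complement adjustment are summed
+	 * in a 32-bit accumulator and the carry folded back once, e.g.
+	 * 0xffff + 0x0003 = 0x10002, which folds to 0x0002 + 0x1 = 0x0003.
+	 * The IP header checksum below is simply regenerated.)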
+ */ + iph->check = sfe_ipv4_gen_ip_csum(iph); + + /* + * Update traffic stats. + */ + cm->rx_packet_count++; + cm->rx_byte_count += len; + + /* + * If we're not already on the active list then insert ourselves at the tail + * of the current list. + */ + if (unlikely(!cm->active)) { + cm->active = true; + cm->active_prev = si->active_tail; + if (likely(si->active_tail)) { + si->active_tail->active_next = cm; + } else { + si->active_head = cm; + } + si->active_tail = cm; + } + + xmit_dev = cm->xmit_dev; + skb->dev = xmit_dev; + + /* + * Check to see if we need to write a header. + */ + if (likely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_WRITE_L2_HDR)) { + if (unlikely(!(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_WRITE_FAST_ETH_HDR))) { + dev_hard_header(skb, xmit_dev, ETH_P_IP, + cm->xmit_dest_mac, cm->xmit_src_mac, len); + } else { + /* + * For the simple case we write this really fast. + */ + struct sfe_ipv4_eth_hdr *eth = (struct sfe_ipv4_eth_hdr *)__skb_push(skb, ETH_HLEN); + eth->h_proto = htons(ETH_P_IP); + eth->h_dest[0] = cm->xmit_dest_mac[0]; + eth->h_dest[1] = cm->xmit_dest_mac[1]; + eth->h_dest[2] = cm->xmit_dest_mac[2]; + eth->h_source[0] = cm->xmit_src_mac[0]; + eth->h_source[1] = cm->xmit_src_mac[1]; + eth->h_source[2] = cm->xmit_src_mac[2]; + } + } + + /* + * Update priority of skb. + */ + if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_PRIORITY_REMARK)) { + skb->priority = cm->priority; + } + + /* + * Mark outgoing packet + */ + skb->mark = cm->connection->mark; + if (skb->mark) { + DEBUG_TRACE("SKB MARK is NON ZERO %x\n", skb->mark); + } + + si->packets_forwarded++; + spin_unlock_bh(&si->lock); + + /* + * We're going to check for GSO flags when we transmit the packet so + * start fetching the necessary cache line now. + */ + prefetch(skb_shinfo(skb)); + + /* + * Mark that this packet has been fast forwarded. + */ + skb->fast_forwarded = 1; + + /* + * Send the packet on its way. + */ + dev_queue_xmit(skb); + + return 1; +} + +/* + * sfe_ipv4_recv_icmp() + * Handle ICMP packet receives. + * + * ICMP packets aren't handled as a "fast path" and always have us process them + * through the default Linux stack. What we do need to do is look for any errors + * about connections we are handling in the fast path. If we find any such + * connections then we want to flush their state so that the ICMP error path + * within Linux has all of the correct state should it need it. + */ +static int sfe_ipv4_recv_icmp(struct sfe_ipv4 *si, struct sk_buff *skb, struct net_device *dev, + unsigned int len, struct sfe_ipv4_ip_hdr *iph, unsigned int ihl) +{ + struct icmphdr *icmph; + struct sfe_ipv4_ip_hdr *icmp_iph; + unsigned int icmp_ihl_words; + unsigned int icmp_ihl; + u32 *icmp_trans_h; + struct sfe_ipv4_udp_hdr *icmp_udph; + struct sfe_ipv4_tcp_hdr *icmp_tcph; + __be32 src_ip; + __be32 dest_ip; + __be16 src_port; + __be16 dest_port; + struct sfe_ipv4_connection_match *cm; + struct sfe_ipv4_connection *c; + u32 pull_len = sizeof(struct icmphdr) + ihl; + + /* + * Is our packet too short to contain a valid ICMP header? + */ + len -= ihl; + if (!pskb_may_pull(skb, pull_len)) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_ICMP_HEADER_INCOMPLETE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("packet too short for ICMP header\n"); + return 0; + } + + /* + * We only handle "destination unreachable" and "time exceeded" messages. 
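+	 * (These are ICMP types 3 and 11 respectively.) Anything else is
+	 * passed back to the slow path untouched.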
+ */ + icmph = (struct icmphdr *)(skb->data + ihl); + if ((icmph->type != ICMP_DEST_UNREACH) + && (icmph->type != ICMP_TIME_EXCEEDED)) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_ICMP_UNHANDLED_TYPE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("unhandled ICMP type: 0x%x\n", icmph->type); + return 0; + } + + /* + * Do we have the full embedded IP header? + */ + len -= sizeof(struct icmphdr); + pull_len += sizeof(struct sfe_ipv4_ip_hdr); + if (!pskb_may_pull(skb, pull_len)) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_HEADER_INCOMPLETE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("Embedded IP header not complete\n"); + return 0; + } + + /* + * Is our embedded IP version wrong? + */ + icmp_iph = (struct sfe_ipv4_ip_hdr *)(icmph + 1); + if (unlikely(icmp_iph->version != 4)) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_NON_V4]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("IP version: %u\n", icmp_iph->version); + return 0; + } + + /* + * Do we have the full embedded IP header, including any options? + */ + icmp_ihl_words = icmp_iph->ihl; + icmp_ihl = icmp_ihl_words << 2; + pull_len += icmp_ihl - sizeof(struct sfe_ipv4_ip_hdr); + if (!pskb_may_pull(skb, pull_len)) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_IP_OPTIONS_INCOMPLETE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("Embedded header not large enough for IP options\n"); + return 0; + } + + len -= icmp_ihl; + icmp_trans_h = ((u32 *)icmp_iph) + icmp_ihl_words; + + /* + * Handle the embedded transport layer header. + */ + switch (icmp_iph->protocol) { + case IPPROTO_UDP: + /* + * We should have 8 bytes of UDP header - that's enough to identify + * the connection. + */ + pull_len += 8; + if (!pskb_may_pull(skb, pull_len)) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_UDP_HEADER_INCOMPLETE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("Incomplete embedded UDP header\n"); + return 0; + } + + icmp_udph = (struct sfe_ipv4_udp_hdr *)icmp_trans_h; + src_port = icmp_udph->source; + dest_port = icmp_udph->dest; + break; + + case IPPROTO_TCP: + /* + * We should have 8 bytes of TCP header - that's enough to identify + * the connection. + */ + pull_len += 8; + if (!pskb_may_pull(skb, pull_len)) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_TCP_HEADER_INCOMPLETE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("Incomplete embedded TCP header\n"); + return 0; + } + + icmp_tcph = (struct sfe_ipv4_tcp_hdr *)icmp_trans_h; + src_port = icmp_tcph->source; + dest_port = icmp_tcph->dest; + break; + + default: + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_UNHANDLED_PROTOCOL]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("Unhandled embedded IP protocol: %u\n", icmp_iph->protocol); + return 0; + } + + src_ip = icmp_iph->saddr; + dest_ip = icmp_iph->daddr; + + spin_lock_bh(&si->lock); + + /* + * Look for a connection match. Note that we reverse the source and destination + * here because our embedded message contains a packet that was sent in the + * opposite direction to the one in which we just received it. 
It will have
+	 * been sent on the interface from which we received it though so that's still
+	 * ok to use.
+	 */
+	cm = sfe_ipv4_find_sfe_ipv4_connection_match(si, dev, icmp_iph->protocol, dest_ip, dest_port, src_ip, src_port);
+	if (unlikely(!cm)) {
+		si->exception_events[SFE_IPV4_EXCEPTION_EVENT_ICMP_NO_CONNECTION]++;
+		si->packets_not_forwarded++;
+		spin_unlock_bh(&si->lock);
+
+		DEBUG_TRACE("no connection found\n");
+		return 0;
+	}
+
+	/*
+	 * We found a connection so now remove it from the connection list and flush
+	 * its state.
+	 */
+	c = cm->connection;
+	sfe_ipv4_remove_sfe_ipv4_connection(si, c);
+	si->exception_events[SFE_IPV4_EXCEPTION_EVENT_ICMP_FLUSHED_CONNECTION]++;
+	si->packets_not_forwarded++;
+	spin_unlock_bh(&si->lock);
+
+	sfe_ipv4_flush_sfe_ipv4_connection(si, c, SFE_SYNC_REASON_FLUSH);
+	return 0;
+}
+
+/*
+ * sfe_ipv4_recv()
+ *	Handle packet receives and forwarding.
+ *
+ * Returns 1 if the packet is forwarded or 0 if it isn't.
+ */
+int sfe_ipv4_recv(struct net_device *dev, struct sk_buff *skb)
+{
+	struct sfe_ipv4 *si = &__si;
+	unsigned int len;
+	unsigned int tot_len;
+	unsigned int frag_off;
+	unsigned int ihl;
+	bool flush_on_find;
+	bool ip_options;
+	struct sfe_ipv4_ip_hdr *iph;
+	u32 protocol;
+
+	/*
+	 * Check that we have space for an IP header here.
+	 */
+	len = skb->len;
+	if (unlikely(!pskb_may_pull(skb, sizeof(struct sfe_ipv4_ip_hdr)))) {
+		spin_lock_bh(&si->lock);
+		si->exception_events[SFE_IPV4_EXCEPTION_EVENT_HEADER_INCOMPLETE]++;
+		si->packets_not_forwarded++;
+		spin_unlock_bh(&si->lock);
+
+		DEBUG_TRACE("len: %u is too short\n", len);
+		return 0;
+	}
+
+	/*
+	 * Check that our "total length" is large enough for an IP header.
+	 */
+	iph = (struct sfe_ipv4_ip_hdr *)skb->data;
+	tot_len = ntohs(iph->tot_len);
+	if (unlikely(tot_len < sizeof(struct sfe_ipv4_ip_hdr))) {
+		spin_lock_bh(&si->lock);
+		si->exception_events[SFE_IPV4_EXCEPTION_EVENT_BAD_TOTAL_LENGTH]++;
+		si->packets_not_forwarded++;
+		spin_unlock_bh(&si->lock);
+
+		DEBUG_TRACE("tot_len: %u is too short\n", tot_len);
+		return 0;
+	}
+
+	/*
+	 * Is our IP version wrong?
+	 */
+	if (unlikely(iph->version != 4)) {
+		spin_lock_bh(&si->lock);
+		si->exception_events[SFE_IPV4_EXCEPTION_EVENT_NON_V4]++;
+		si->packets_not_forwarded++;
+		spin_unlock_bh(&si->lock);
+
+		DEBUG_TRACE("IP version: %u\n", iph->version);
+		return 0;
+	}
+
+	/*
+	 * Does our datagram fit inside the skb?
+	 */
+	if (unlikely(tot_len > len)) {
+		spin_lock_bh(&si->lock);
+		si->exception_events[SFE_IPV4_EXCEPTION_EVENT_DATAGRAM_INCOMPLETE]++;
+		si->packets_not_forwarded++;
+		spin_unlock_bh(&si->lock);
+
+		DEBUG_TRACE("tot_len: %u, exceeds len: %u\n", tot_len, len);
+		return 0;
+	}
+
+	/*
+	 * Do we have a non-initial fragment?
+	 */
+	frag_off = ntohs(iph->frag_off);
+	if (unlikely(frag_off & IP_OFFSET)) {
+		spin_lock_bh(&si->lock);
+		si->exception_events[SFE_IPV4_EXCEPTION_EVENT_NON_INITIAL_FRAGMENT]++;
+		si->packets_not_forwarded++;
+		spin_unlock_bh(&si->lock);
+
+		DEBUG_TRACE("non-initial fragment\n");
+		return 0;
+	}
+
+	/*
+	 * If we have a (first) fragment then mark it to cause any connection to flush.
+	 */
+	flush_on_find = unlikely(frag_off & IP_MF) ? true : false;
+
+	/*
+	 * Do we have any IP options? That's definitely a slow path! If we do have IP
+	 * options we need to recheck our header size.
+	 */
+	ihl = iph->ihl << 2;
+	ip_options = unlikely(ihl != sizeof(struct sfe_ipv4_ip_hdr)) ?
true : false; + if (unlikely(ip_options)) { + if (unlikely(len < ihl)) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_IP_OPTIONS_INCOMPLETE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("len: %u is too short for header of size: %u\n", len, ihl); + return 0; + } + + flush_on_find = true; + } + + protocol = iph->protocol; + if (IPPROTO_UDP == protocol) { + return sfe_ipv4_recv_udp(si, skb, dev, len, iph, ihl, flush_on_find); + } + + if (IPPROTO_TCP == protocol) { + return sfe_ipv4_recv_tcp(si, skb, dev, len, iph, ihl, flush_on_find); + } + + if (IPPROTO_ICMP == protocol) { + return sfe_ipv4_recv_icmp(si, skb, dev, len, iph, ihl); + } + + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_UNHANDLED_PROTOCOL]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("not UDP, TCP or ICMP: %u\n", protocol); + return 0; +} + +static void +sfe_ipv4_update_tcp_state(struct sfe_ipv4_connection *c, + struct sfe_connection_create *sic) +{ + struct sfe_ipv4_connection_match *orig_cm; + struct sfe_ipv4_connection_match *repl_cm; + struct sfe_ipv4_tcp_connection_match *orig_tcp; + struct sfe_ipv4_tcp_connection_match *repl_tcp; + + orig_cm = c->original_match; + repl_cm = c->reply_match; + orig_tcp = &orig_cm->protocol_state.tcp; + repl_tcp = &repl_cm->protocol_state.tcp; + + /* update orig */ + if (orig_tcp->max_win < sic->src_td_max_window) { + orig_tcp->max_win = sic->src_td_max_window; + } + if ((s32)(orig_tcp->end - sic->src_td_end) < 0) { + orig_tcp->end = sic->src_td_end; + } + if ((s32)(orig_tcp->max_end - sic->src_td_max_end) < 0) { + orig_tcp->max_end = sic->src_td_max_end; + } + + /* update reply */ + if (repl_tcp->max_win < sic->dest_td_max_window) { + repl_tcp->max_win = sic->dest_td_max_window; + } + if ((s32)(repl_tcp->end - sic->dest_td_end) < 0) { + repl_tcp->end = sic->dest_td_end; + } + if ((s32)(repl_tcp->max_end - sic->dest_td_max_end) < 0) { + repl_tcp->max_end = sic->dest_td_max_end; + } + + /* update match flags */ + orig_cm->flags &= ~SFE_IPV4_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK; + repl_cm->flags &= ~SFE_IPV4_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK; + if (sic->flags & SFE_CREATE_FLAG_NO_SEQ_CHECK) { + orig_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK; + repl_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK; + } +} + +static void +sfe_ipv4_update_protocol_state(struct sfe_ipv4_connection *c, + struct sfe_connection_create *sic) +{ + switch (sic->protocol) { + case IPPROTO_TCP: + sfe_ipv4_update_tcp_state(c, sic); + break; + } +} + +void sfe_ipv4_update_rule(struct sfe_connection_create *sic) +{ + struct sfe_ipv4_connection *c; + struct sfe_ipv4 *si = &__si; + + spin_lock_bh(&si->lock); + + c = sfe_ipv4_find_sfe_ipv4_connection(si, + sic->protocol, + sic->src_ip.ip, + sic->src_port, + sic->dest_ip.ip, + sic->dest_port); + if (c != NULL) { + sfe_ipv4_update_protocol_state(c, sic); + } + + spin_unlock_bh(&si->lock); +} + +/* + * sfe_ipv4_create_rule() + * Create a forwarding rule. 
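+ *
+ * The caller (typically the connection manager) fills in a struct
+ * sfe_connection_create with both the pre- and post-translation
+ * address/port tuples plus the source and destination net_devices;
+ * a return of -EADDRINUSE means the flow is already accelerated.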
+ */ +int sfe_ipv4_create_rule(struct sfe_connection_create *sic) +{ + struct sfe_ipv4 *si = &__si; + struct sfe_ipv4_connection *c; + struct sfe_ipv4_connection_match *original_cm; + struct sfe_ipv4_connection_match *reply_cm; + struct net_device *dest_dev; + struct net_device *src_dev; + + dest_dev = sic->dest_dev; + src_dev = sic->src_dev; + + if (unlikely((dest_dev->reg_state != NETREG_REGISTERED) || + (src_dev->reg_state != NETREG_REGISTERED))) { + return -EINVAL; + } + + spin_lock_bh(&si->lock); + si->connection_create_requests++; + + /* + * Check to see if there is already a flow that matches the rule we're + * trying to create. If there is then we can't create a new one. + */ + c = sfe_ipv4_find_sfe_ipv4_connection(si, + sic->protocol, + sic->src_ip.ip, + sic->src_port, + sic->dest_ip.ip, + sic->dest_port); + if (c != NULL) { + si->connection_create_collisions++; + + /* + * If we already have the flow then it's likely that this + * request to create the connection rule contains more + * up-to-date information. Check and update accordingly. + */ + sfe_ipv4_update_protocol_state(c, sic); + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("connection already exists - mark: %08x, p: %d\n" + " s: %s:%pxM:%pI4:%u, d: %s:%pxM:%pI4:%u\n", + sic->mark, sic->protocol, + sic->src_dev->name, sic->src_mac, &sic->src_ip.ip, ntohs(sic->src_port), + sic->dest_dev->name, sic->dest_mac, &sic->dest_ip.ip, ntohs(sic->dest_port)); + return -EADDRINUSE; + } + + /* + * Allocate the various connection tracking objects. + */ + c = (struct sfe_ipv4_connection *)kmalloc(sizeof(struct sfe_ipv4_connection), GFP_ATOMIC); + if (unlikely(!c)) { + spin_unlock_bh(&si->lock); + return -ENOMEM; + } + + original_cm = (struct sfe_ipv4_connection_match *)kmalloc(sizeof(struct sfe_ipv4_connection_match), GFP_ATOMIC); + if (unlikely(!original_cm)) { + spin_unlock_bh(&si->lock); + kfree(c); + return -ENOMEM; + } + + reply_cm = (struct sfe_ipv4_connection_match *)kmalloc(sizeof(struct sfe_ipv4_connection_match), GFP_ATOMIC); + if (unlikely(!reply_cm)) { + spin_unlock_bh(&si->lock); + kfree(original_cm); + kfree(c); + return -ENOMEM; + } + + /* + * Fill in the "original" direction connection matching object. + * Note that the transmit MAC address is "dest_mac_xlate" because + * we always know both ends of a connection by their translated + * addresses and not their public addresses. 
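+	 * For example, with a DNAT rule 203.0.113.5:80 -> 192.168.1.10:8080
+	 * the original direction transmits towards 192.168.1.10, so the MAC
+	 * we need is that of (or towards) the translated destination.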
+ */ + original_cm->match_dev = src_dev; + original_cm->match_protocol = sic->protocol; + original_cm->match_src_ip = sic->src_ip.ip; + original_cm->match_src_port = sic->src_port; + original_cm->match_dest_ip = sic->dest_ip.ip; + original_cm->match_dest_port = sic->dest_port; + original_cm->xlate_src_ip = sic->src_ip_xlate.ip; + original_cm->xlate_src_port = sic->src_port_xlate; + original_cm->xlate_dest_ip = sic->dest_ip_xlate.ip; + original_cm->xlate_dest_port = sic->dest_port_xlate; + original_cm->rx_packet_count = 0; + original_cm->rx_packet_count64 = 0; + original_cm->rx_byte_count = 0; + original_cm->rx_byte_count64 = 0; + original_cm->xmit_dev = dest_dev; + original_cm->xmit_dev_mtu = sic->dest_mtu; + memcpy(original_cm->xmit_src_mac, dest_dev->dev_addr, ETH_ALEN); + memcpy(original_cm->xmit_dest_mac, sic->dest_mac_xlate, ETH_ALEN); + original_cm->connection = c; + original_cm->counter_match = reply_cm; + original_cm->flags = 0; + if (sic->flags & SFE_CREATE_FLAG_REMARK_PRIORITY) { + original_cm->priority = sic->src_priority; + original_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_PRIORITY_REMARK; + } + if (sic->flags & SFE_CREATE_FLAG_REMARK_DSCP) { + original_cm->dscp = sic->src_dscp << SFE_IPV4_DSCP_SHIFT; + original_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_DSCP_REMARK; + } +#ifdef CONFIG_NF_FLOW_COOKIE + original_cm->flow_cookie = 0; +#endif +#ifdef CONFIG_XFRM + original_cm->flow_accel = sic->original_accel; +#endif + original_cm->active_next = NULL; + original_cm->active_prev = NULL; + original_cm->active = false; + + /* + * For PPP links we don't write an L2 header. For everything else we do. + */ + if (!(dest_dev->flags & IFF_POINTOPOINT)) { + original_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_WRITE_L2_HDR; + + /* + * If our dev writes Ethernet headers then we can write a really fast + * version. + */ + if (dest_dev->header_ops) { + if (dest_dev->header_ops->create == eth_header) { + original_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_WRITE_FAST_ETH_HDR; + } + } + } + + /* + * Fill in the "reply" direction connection matching object. 
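+	 * The reply direction matches on the post-translation tuple, since
+	 * that is what reply packets actually carry on the wire.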
+ */ + reply_cm->match_dev = dest_dev; + reply_cm->match_protocol = sic->protocol; + reply_cm->match_src_ip = sic->dest_ip_xlate.ip; + reply_cm->match_src_port = sic->dest_port_xlate; + reply_cm->match_dest_ip = sic->src_ip_xlate.ip; + reply_cm->match_dest_port = sic->src_port_xlate; + reply_cm->xlate_src_ip = sic->dest_ip.ip; + reply_cm->xlate_src_port = sic->dest_port; + reply_cm->xlate_dest_ip = sic->src_ip.ip; + reply_cm->xlate_dest_port = sic->src_port; + reply_cm->rx_packet_count = 0; + reply_cm->rx_packet_count64 = 0; + reply_cm->rx_byte_count = 0; + reply_cm->rx_byte_count64 = 0; + reply_cm->xmit_dev = src_dev; + reply_cm->xmit_dev_mtu = sic->src_mtu; + memcpy(reply_cm->xmit_src_mac, src_dev->dev_addr, ETH_ALEN); + memcpy(reply_cm->xmit_dest_mac, sic->src_mac, ETH_ALEN); + reply_cm->connection = c; + reply_cm->counter_match = original_cm; + reply_cm->flags = 0; + if (sic->flags & SFE_CREATE_FLAG_REMARK_PRIORITY) { + reply_cm->priority = sic->dest_priority; + reply_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_PRIORITY_REMARK; + } + if (sic->flags & SFE_CREATE_FLAG_REMARK_DSCP) { + reply_cm->dscp = sic->dest_dscp << SFE_IPV4_DSCP_SHIFT; + reply_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_DSCP_REMARK; + } +#ifdef CONFIG_NF_FLOW_COOKIE + reply_cm->flow_cookie = 0; +#endif +#ifdef CONFIG_XFRM + reply_cm->flow_accel = sic->reply_accel; +#endif + reply_cm->active_next = NULL; + reply_cm->active_prev = NULL; + reply_cm->active = false; + + /* + * For PPP links we don't write an L2 header. For everything else we do. + */ + if (!(src_dev->flags & IFF_POINTOPOINT)) { + reply_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_WRITE_L2_HDR; + + /* + * If our dev writes Ethernet headers then we can write a really fast + * version. + */ + if (src_dev->header_ops) { + if (src_dev->header_ops->create == eth_header) { + reply_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_WRITE_FAST_ETH_HDR; + } + } + } + + + if (sic->dest_ip.ip != sic->dest_ip_xlate.ip || sic->dest_port != sic->dest_port_xlate) { + original_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_DEST; + reply_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_SRC; + } + + if (sic->src_ip.ip != sic->src_ip_xlate.ip || sic->src_port != sic->src_port_xlate) { + original_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_SRC; + reply_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_DEST; + } + + c->protocol = sic->protocol; + c->src_ip = sic->src_ip.ip; + c->src_ip_xlate = sic->src_ip_xlate.ip; + c->src_port = sic->src_port; + c->src_port_xlate = sic->src_port_xlate; + c->original_dev = src_dev; + c->original_match = original_cm; + c->dest_ip = sic->dest_ip.ip; + c->dest_ip_xlate = sic->dest_ip_xlate.ip; + c->dest_port = sic->dest_port; + c->dest_port_xlate = sic->dest_port_xlate; + c->reply_dev = dest_dev; + c->reply_match = reply_cm; + c->mark = sic->mark; + c->debug_read_seq = 0; + c->last_sync_jiffies = get_jiffies_64(); + + /* + * Take hold of our source and dest devices for the duration of the connection. + */ + dev_hold(c->original_dev); + dev_hold(c->reply_dev); + + /* + * Initialize the protocol-specific information that we track. + */ + switch (sic->protocol) { + case IPPROTO_TCP: + original_cm->protocol_state.tcp.win_scale = sic->src_td_window_scale; + original_cm->protocol_state.tcp.max_win = sic->src_td_max_window ? 
sic->src_td_max_window : 1; + original_cm->protocol_state.tcp.end = sic->src_td_end; + original_cm->protocol_state.tcp.max_end = sic->src_td_max_end; + reply_cm->protocol_state.tcp.win_scale = sic->dest_td_window_scale; + reply_cm->protocol_state.tcp.max_win = sic->dest_td_max_window ? sic->dest_td_max_window : 1; + reply_cm->protocol_state.tcp.end = sic->dest_td_end; + reply_cm->protocol_state.tcp.max_end = sic->dest_td_max_end; + if (sic->flags & SFE_CREATE_FLAG_NO_SEQ_CHECK) { + original_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK; + reply_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK; + } + break; + } + + sfe_ipv4_connection_match_compute_translations(original_cm); + sfe_ipv4_connection_match_compute_translations(reply_cm); + sfe_ipv4_insert_sfe_ipv4_connection(si, c); + + spin_unlock_bh(&si->lock); + + /* + * We have everything we need! + */ + DEBUG_INFO("new connection - mark: %08x, p: %d\n" + " s: %s:%pxM(%pxM):%pI4(%pI4):%u(%u)\n" + " d: %s:%pxM(%pxM):%pI4(%pI4):%u(%u)\n", + sic->mark, sic->protocol, + sic->src_dev->name, sic->src_mac, sic->src_mac_xlate, + &sic->src_ip.ip, &sic->src_ip_xlate.ip, ntohs(sic->src_port), ntohs(sic->src_port_xlate), + dest_dev->name, sic->dest_mac, sic->dest_mac_xlate, + &sic->dest_ip.ip, &sic->dest_ip_xlate.ip, ntohs(sic->dest_port), ntohs(sic->dest_port_xlate)); + + return 0; +} + +/* + * sfe_ipv4_destroy_rule() + * Destroy a forwarding rule. + */ +void sfe_ipv4_destroy_rule(struct sfe_connection_destroy *sid) +{ + struct sfe_ipv4 *si = &__si; + struct sfe_ipv4_connection *c; + + spin_lock_bh(&si->lock); + si->connection_destroy_requests++; + + /* + * Check to see if we have a flow that matches the rule we're trying + * to destroy. If there isn't then we can't destroy it. + */ + c = sfe_ipv4_find_sfe_ipv4_connection(si, sid->protocol, sid->src_ip.ip, sid->src_port, + sid->dest_ip.ip, sid->dest_port); + if (!c) { + si->connection_destroy_misses++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("connection does not exist - p: %d, s: %pI4:%u, d: %pI4:%u\n", + sid->protocol, &sid->src_ip, ntohs(sid->src_port), + &sid->dest_ip, ntohs(sid->dest_port)); + return; + } + + /* + * Remove our connection details from the hash tables. + */ + sfe_ipv4_remove_sfe_ipv4_connection(si, c); + spin_unlock_bh(&si->lock); + + sfe_ipv4_flush_sfe_ipv4_connection(si, c, SFE_SYNC_REASON_DESTROY); + + DEBUG_INFO("connection destroyed - p: %d, s: %pI4:%u, d: %pI4:%u\n", + sid->protocol, &sid->src_ip.ip, ntohs(sid->src_port), + &sid->dest_ip.ip, ntohs(sid->dest_port)); +} + +/* + * sfe_ipv4_register_sync_rule_callback() + * Register a callback for rule synchronization. + */ +void sfe_ipv4_register_sync_rule_callback(sfe_sync_rule_callback_t sync_rule_callback) +{ + struct sfe_ipv4 *si = &__si; + + spin_lock_bh(&si->lock); + rcu_assign_pointer(si->sync_rule_callback, sync_rule_callback); + spin_unlock_bh(&si->lock); +} + +/* + * sfe_ipv4_get_debug_dev() + */ +static ssize_t sfe_ipv4_get_debug_dev(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct sfe_ipv4 *si = &__si; + ssize_t count; + int num; + + spin_lock_bh(&si->lock); + num = si->debug_dev; + spin_unlock_bh(&si->lock); + + count = snprintf(buf, (ssize_t)PAGE_SIZE, "%d\n", num); + return count; +} + +/* + * sysfs attributes. + */ +static const struct device_attribute sfe_ipv4_debug_dev_attr = + __ATTR(debug_dev, S_IWUSR | S_IRUGO, sfe_ipv4_get_debug_dev, NULL); + +/* + * sfe_ipv4_destroy_all_rules_for_dev() + * Destroy all connections that match a particular device. 
+ * + * If we pass dev as NULL then this destroys all connections. + */ +void sfe_ipv4_destroy_all_rules_for_dev(struct net_device *dev) +{ + struct sfe_ipv4 *si = &__si; + struct sfe_ipv4_connection *c; + +another_round: + spin_lock_bh(&si->lock); + + for (c = si->all_connections_head; c; c = c->all_connections_next) { + /* + * Does this connection relate to the device we are destroying? + */ + if (!dev + || (dev == c->original_dev) + || (dev == c->reply_dev)) { + break; + } + } + + if (c) { + sfe_ipv4_remove_sfe_ipv4_connection(si, c); + } + + spin_unlock_bh(&si->lock); + + if (c) { + sfe_ipv4_flush_sfe_ipv4_connection(si, c, SFE_SYNC_REASON_DESTROY); + goto another_round; + } +} + +/* + * sfe_ipv4_periodic_sync() + */ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 15, 0)) +static void sfe_ipv4_periodic_sync(unsigned long arg) +#else +static void sfe_ipv4_periodic_sync(struct timer_list *tl) +#endif +{ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 15, 0)) + struct sfe_ipv4 *si = (struct sfe_ipv4 *)arg; +#else + struct sfe_ipv4 *si = from_timer(si, tl, timer); +#endif + u64 now_jiffies; + int quota; + sfe_sync_rule_callback_t sync_rule_callback; + + now_jiffies = get_jiffies_64(); + + rcu_read_lock(); + sync_rule_callback = rcu_dereference(si->sync_rule_callback); + if (!sync_rule_callback) { + rcu_read_unlock(); + goto done; + } + + spin_lock_bh(&si->lock); + sfe_ipv4_update_summary_stats(si); + + /* + * Get an estimate of the number of connections to parse in this sync. + */ + quota = (si->num_connections + 63) / 64; + + /* + * Walk the "active" list and sync the connection state. + */ + while (quota--) { + struct sfe_ipv4_connection_match *cm; + struct sfe_ipv4_connection_match *counter_cm; + struct sfe_ipv4_connection *c; + struct sfe_connection_sync sis; + + cm = si->active_head; + if (!cm) { + break; + } + + /* + * There's a possibility that our counter match is in the active list too. + * If it is then remove it. + */ + counter_cm = cm->counter_match; + if (counter_cm->active) { + counter_cm->active = false; + + /* + * We must have a connection preceding this counter match + * because that's the one that got us to this point, so we don't have + * to worry about removing the head of the list. + */ + counter_cm->active_prev->active_next = counter_cm->active_next; + + if (likely(counter_cm->active_next)) { + counter_cm->active_next->active_prev = counter_cm->active_prev; + } else { + si->active_tail = counter_cm->active_prev; + } + + counter_cm->active_next = NULL; + counter_cm->active_prev = NULL; + } + + /* + * Now remove the head of the active scan list. + */ + cm->active = false; + si->active_head = cm->active_next; + if (likely(cm->active_next)) { + cm->active_next->active_prev = NULL; + } else { + si->active_tail = NULL; + } + cm->active_next = NULL; + + /* + * Sync the connection state. + */ + c = cm->connection; + sfe_ipv4_gen_sync_sfe_ipv4_connection(si, c, &sis, SFE_SYNC_REASON_STATS, now_jiffies); + + /* + * We don't want to be holding the lock when we sync! + */ + spin_unlock_bh(&si->lock); + sync_rule_callback(&sis); + spin_lock_bh(&si->lock); + } + + spin_unlock_bh(&si->lock); + rcu_read_unlock(); + +done: + mod_timer(&si->timer, jiffies + ((HZ + 99) / 100)); +} + +#define CHAR_DEV_MSG_SIZE 768 + +/* + * sfe_ipv4_debug_dev_read_start() + * Generate part of the XML output. 
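+ *
+ * The debug read path is a small state machine: each handler below emits
+ * one XML fragment and advances ws->state until
+ * SFE_IPV4_DEBUG_XML_STATE_DONE is reached.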
+ */
+static bool sfe_ipv4_debug_dev_read_start(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length,
+					  int *total_read, struct sfe_ipv4_debug_xml_write_state *ws)
+{
+	int bytes_read;
+
+	si->debug_read_seq++;
+
+	bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "<sfe_ipv4>\n");
+	if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) {
+		return false;
+	}
+
+	*length -= bytes_read;
+	*total_read += bytes_read;
+
+	ws->state++;
+	return true;
+}
+
+/*
+ * sfe_ipv4_debug_dev_read_connections_start()
+ *	Generate part of the XML output.
+ */
+static bool sfe_ipv4_debug_dev_read_connections_start(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length,
+						      int *total_read, struct sfe_ipv4_debug_xml_write_state *ws)
+{
+	int bytes_read;
+
+	bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t<connections>\n");
+	if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) {
+		return false;
+	}
+
+	*length -= bytes_read;
+	*total_read += bytes_read;
+
+	ws->state++;
+	return true;
+}
+
+/*
+ * sfe_ipv4_debug_dev_read_connections_connection()
+ *	Generate part of the XML output.
+ */
+static bool sfe_ipv4_debug_dev_read_connections_connection(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length,
+							   int *total_read, struct sfe_ipv4_debug_xml_write_state *ws)
+{
+	struct sfe_ipv4_connection *c;
+	struct sfe_ipv4_connection_match *original_cm;
+	struct sfe_ipv4_connection_match *reply_cm;
+	int bytes_read;
+	int protocol;
+	struct net_device *src_dev;
+	__be32 src_ip;
+	__be32 src_ip_xlate;
+	__be16 src_port;
+	__be16 src_port_xlate;
+	u64 src_rx_packets;
+	u64 src_rx_bytes;
+	struct net_device *dest_dev;
+	__be32 dest_ip;
+	__be32 dest_ip_xlate;
+	__be16 dest_port;
+	__be16 dest_port_xlate;
+	u64 dest_rx_packets;
+	u64 dest_rx_bytes;
+	u64 last_sync_jiffies;
+	u32 mark, src_priority, dest_priority, src_dscp, dest_dscp;
+#ifdef CONFIG_NF_FLOW_COOKIE
+	int src_flow_cookie, dst_flow_cookie;
+#endif
+
+	spin_lock_bh(&si->lock);
+
+	for (c = si->all_connections_head; c; c = c->all_connections_next) {
+		if (c->debug_read_seq < si->debug_read_seq) {
+			c->debug_read_seq = si->debug_read_seq;
+			break;
+		}
+	}
+
+	/*
+	 * If there were no connections then move to the next state.
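+	 * (Connections already dumped in this pass have debug_read_seq equal
+	 * to si->debug_read_seq, so the walk above resumes where the previous
+	 * read call left off.)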
+	 */
+	if (!c) {
+		spin_unlock_bh(&si->lock);
+		ws->state++;
+		return true;
+	}
+
+	original_cm = c->original_match;
+	reply_cm = c->reply_match;
+
+	protocol = c->protocol;
+	src_dev = c->original_dev;
+	src_ip = c->src_ip;
+	src_ip_xlate = c->src_ip_xlate;
+	src_port = c->src_port;
+	src_port_xlate = c->src_port_xlate;
+	src_priority = original_cm->priority;
+	src_dscp = original_cm->dscp >> SFE_IPV4_DSCP_SHIFT;
+
+	sfe_ipv4_connection_match_update_summary_stats(original_cm);
+	sfe_ipv4_connection_match_update_summary_stats(reply_cm);
+
+	src_rx_packets = original_cm->rx_packet_count64;
+	src_rx_bytes = original_cm->rx_byte_count64;
+	dest_dev = c->reply_dev;
+	dest_ip = c->dest_ip;
+	dest_ip_xlate = c->dest_ip_xlate;
+	dest_port = c->dest_port;
+	dest_port_xlate = c->dest_port_xlate;
+	dest_priority = reply_cm->priority;
+	dest_dscp = reply_cm->dscp >> SFE_IPV4_DSCP_SHIFT;
+	dest_rx_packets = reply_cm->rx_packet_count64;
+	dest_rx_bytes = reply_cm->rx_byte_count64;
+	last_sync_jiffies = get_jiffies_64() - c->last_sync_jiffies;
+	mark = c->mark;
+#ifdef CONFIG_NF_FLOW_COOKIE
+	src_flow_cookie = original_cm->flow_cookie;
+	dst_flow_cookie = reply_cm->flow_cookie;
+#endif
+	spin_unlock_bh(&si->lock);
+
+	bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t\t<connection "
+			      "protocol=\"%u\" "
+			      "src_dev=\"%s\" "
+			      "src_ip=\"%pI4\" src_ip_xlate=\"%pI4\" "
+			      "src_port=\"%u\" src_port_xlate=\"%u\" "
+			      "src_priority=\"%u\" src_dscp=\"%u\" "
+			      "src_rx_pkts=\"%llu\" src_rx_bytes=\"%llu\" "
+			      "dest_dev=\"%s\" "
+			      "dest_ip=\"%pI4\" dest_ip_xlate=\"%pI4\" "
+			      "dest_port=\"%u\" dest_port_xlate=\"%u\" "
+			      "dest_priority=\"%u\" dest_dscp=\"%u\" "
+			      "dest_rx_pkts=\"%llu\" dest_rx_bytes=\"%llu\" "
+#ifdef CONFIG_NF_FLOW_COOKIE
+			      "src_flow_cookie=\"%d\" dst_flow_cookie=\"%d\" "
+#endif
+			      "last_sync_jiffies=\"%llu\" "
+			      "mark=\"%08x\" />\n",
+			      protocol,
+			      src_dev->name,
+			      &src_ip, &src_ip_xlate,
+			      ntohs(src_port), ntohs(src_port_xlate),
+			      src_priority, src_dscp,
+			      src_rx_packets, src_rx_bytes,
+			      dest_dev->name,
+			      &dest_ip, &dest_ip_xlate,
+			      ntohs(dest_port), ntohs(dest_port_xlate),
+			      dest_priority, dest_dscp,
+			      dest_rx_packets, dest_rx_bytes,
+#ifdef CONFIG_NF_FLOW_COOKIE
+			      src_flow_cookie, dst_flow_cookie,
+#endif
+			      last_sync_jiffies, mark);
+
+	if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) {
+		return false;
+	}
+
+	*length -= bytes_read;
+	*total_read += bytes_read;
+
+	return true;
+}
+
+/*
+ * sfe_ipv4_debug_dev_read_connections_end()
+ *	Generate part of the XML output.
+ */
+static bool sfe_ipv4_debug_dev_read_connections_end(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length,
+						    int *total_read, struct sfe_ipv4_debug_xml_write_state *ws)
+{
+	int bytes_read;
+
+	bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t</connections>\n");
+	if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) {
+		return false;
+	}
+
+	*length -= bytes_read;
+	*total_read += bytes_read;
+
+	ws->state++;
+	return true;
+}
+
+/*
+ * sfe_ipv4_debug_dev_read_exceptions_start()
+ *	Generate part of the XML output.
+ */
+static bool sfe_ipv4_debug_dev_read_exceptions_start(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length,
+						     int *total_read, struct sfe_ipv4_debug_xml_write_state *ws)
+{
+	int bytes_read;
+
+	bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t<exceptions>\n");
+	if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) {
+		return false;
+	}
+
+	*length -= bytes_read;
+	*total_read += bytes_read;
+
+	ws->state++;
+	return true;
+}
+
+/*
+ * sfe_ipv4_debug_dev_read_exceptions_exception()
+ *	Generate part of the XML output.
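+ *
+ * Only exception counters with a non-zero count are emitted.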
+ */
+static bool sfe_ipv4_debug_dev_read_exceptions_exception(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length,
+							 int *total_read, struct sfe_ipv4_debug_xml_write_state *ws)
+{
+	u64 ct;
+
+	spin_lock_bh(&si->lock);
+	ct = si->exception_events64[ws->iter_exception];
+	spin_unlock_bh(&si->lock);
+
+	if (ct) {
+		int bytes_read;
+
+		bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE,
+				      "\t\t<exception name=\"%s\" count=\"%llu\" />\n",
+				      sfe_ipv4_exception_events_string[ws->iter_exception],
+				      ct);
+		if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) {
+			return false;
+		}
+
+		*length -= bytes_read;
+		*total_read += bytes_read;
+	}
+
+	ws->iter_exception++;
+	if (ws->iter_exception >= SFE_IPV4_EXCEPTION_EVENT_LAST) {
+		ws->iter_exception = 0;
+		ws->state++;
+	}
+
+	return true;
+}
+
+/*
+ * sfe_ipv4_debug_dev_read_exceptions_end()
+ *	Generate part of the XML output.
+ */
+static bool sfe_ipv4_debug_dev_read_exceptions_end(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length,
+						   int *total_read, struct sfe_ipv4_debug_xml_write_state *ws)
+{
+	int bytes_read;
+
+	bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t</exceptions>\n");
+	if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) {
+		return false;
+	}
+
+	*length -= bytes_read;
+	*total_read += bytes_read;
+
+	ws->state++;
+	return true;
+}
+
+/*
+ * sfe_ipv4_debug_dev_read_stats()
+ *	Generate part of the XML output.
+ */
+static bool sfe_ipv4_debug_dev_read_stats(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length,
+					  int *total_read, struct sfe_ipv4_debug_xml_write_state *ws)
+{
+	int bytes_read;
+	unsigned int num_connections;
+	u64 packets_forwarded;
+	u64 packets_not_forwarded;
+	u64 connection_create_requests;
+	u64 connection_create_collisions;
+	u64 connection_destroy_requests;
+	u64 connection_destroy_misses;
+	u64 connection_flushes;
+	u64 connection_match_hash_hits;
+	u64 connection_match_hash_reorders;
+
+	spin_lock_bh(&si->lock);
+	sfe_ipv4_update_summary_stats(si);
+
+	num_connections = si->num_connections;
+	packets_forwarded = si->packets_forwarded64;
+	packets_not_forwarded = si->packets_not_forwarded64;
+	connection_create_requests = si->connection_create_requests64;
+	connection_create_collisions = si->connection_create_collisions64;
+	connection_destroy_requests = si->connection_destroy_requests64;
+	connection_destroy_misses = si->connection_destroy_misses64;
+	connection_flushes = si->connection_flushes64;
+	connection_match_hash_hits = si->connection_match_hash_hits64;
+	connection_match_hash_reorders = si->connection_match_hash_reorders64;
+	spin_unlock_bh(&si->lock);
+
+	bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t<stats "
+			      "num_connections=\"%u\" "
+			      "pkts_forwarded=\"%llu\" pkts_not_forwarded=\"%llu\" "
+			      "create_requests=\"%llu\" create_collisions=\"%llu\" "
+			      "destroy_requests=\"%llu\" destroy_misses=\"%llu\" "
+			      "flushes=\"%llu\" "
+			      "hash_hits=\"%llu\" hash_reorders=\"%llu\" />\n",
+			      num_connections,
+			      packets_forwarded,
+			      packets_not_forwarded,
+			      connection_create_requests,
+			      connection_create_collisions,
+			      connection_destroy_requests,
+			      connection_destroy_misses,
+			      connection_flushes,
+			      connection_match_hash_hits,
+			      connection_match_hash_reorders);
+	if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) {
+		return false;
+	}
+
+	*length -= bytes_read;
+	*total_read += bytes_read;
+
+	ws->state++;
+	return true;
+}
+
+/*
+ * sfe_ipv4_debug_dev_read_end()
+ *	Generate part of the XML output.
+ */
+static bool sfe_ipv4_debug_dev_read_end(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length,
+					int *total_read, struct sfe_ipv4_debug_xml_write_state *ws)
+{
+	int bytes_read;
+
+	bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "</sfe_ipv4>\n");
+	if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) {
+		return false;
+	}
+
+	*length -= bytes_read;
+	*total_read += bytes_read;
+
+	ws->state++;
+	return true;
+}
+
+/*
+ * Array of write functions that write various XML elements that correspond to
+ * our XML output state machine.
+ */
+static sfe_ipv4_debug_xml_write_method_t sfe_ipv4_debug_xml_write_methods[SFE_IPV4_DEBUG_XML_STATE_DONE] = {
+	sfe_ipv4_debug_dev_read_start,
+	sfe_ipv4_debug_dev_read_connections_start,
+	sfe_ipv4_debug_dev_read_connections_connection,
+	sfe_ipv4_debug_dev_read_connections_end,
+	sfe_ipv4_debug_dev_read_exceptions_start,
+	sfe_ipv4_debug_dev_read_exceptions_exception,
+	sfe_ipv4_debug_dev_read_exceptions_end,
+	sfe_ipv4_debug_dev_read_stats,
+	sfe_ipv4_debug_dev_read_end,
+};
+
+/*
+ * sfe_ipv4_debug_dev_read()
+ *	Send info to userspace upon read request from user
+ */
+static ssize_t sfe_ipv4_debug_dev_read(struct file *filp, char *buffer, size_t length, loff_t *offset)
+{
+	char msg[CHAR_DEV_MSG_SIZE];
+	int total_read = 0;
+	struct sfe_ipv4_debug_xml_write_state *ws;
+	struct sfe_ipv4 *si = &__si;
+
+	ws = (struct sfe_ipv4_debug_xml_write_state *)filp->private_data;
+	while ((ws->state != SFE_IPV4_DEBUG_XML_STATE_DONE) && (length > CHAR_DEV_MSG_SIZE)) {
+		if ((sfe_ipv4_debug_xml_write_methods[ws->state])(si, buffer, msg, &length, &total_read, ws)) {
+			continue;
+		}
+	}
+
+	return total_read;
+}
+
+/*
+ * sfe_ipv4_debug_dev_write()
+ *	Write to char device resets some stats
+ */
+static ssize_t sfe_ipv4_debug_dev_write(struct file *filp, const char *buffer, size_t length, loff_t *offset)
+{
+	struct sfe_ipv4 *si = &__si;
+
+	spin_lock_bh(&si->lock);
+	sfe_ipv4_update_summary_stats(si);
+
+	si->packets_forwarded64 = 0;
+	si->packets_not_forwarded64 = 0;
+	si->connection_create_requests64 = 0;
+	si->connection_create_collisions64 = 0;
+	si->connection_destroy_requests64 = 0;
+	si->connection_destroy_misses64 = 0;
+	si->connection_flushes64 = 0;
+	si->connection_match_hash_hits64 = 0;
+	si->connection_match_hash_reorders64 = 0;
+	spin_unlock_bh(&si->lock);
+
+	return length;
+}
+
+/*
+ * sfe_ipv4_debug_dev_open()
+ */
+static int sfe_ipv4_debug_dev_open(struct inode *inode, struct file *file)
+{
+	struct sfe_ipv4_debug_xml_write_state *ws;
+
+	ws = (struct sfe_ipv4_debug_xml_write_state *)file->private_data;
+	if (!ws) {
+		ws = kzalloc(sizeof(struct sfe_ipv4_debug_xml_write_state), GFP_KERNEL);
+		if (!ws) {
+			return -ENOMEM;
+		}
+
+		ws->state = SFE_IPV4_DEBUG_XML_STATE_START;
+		file->private_data = ws;
+	}
+
+	return 0;
+}
+
+/*
+ * sfe_ipv4_debug_dev_release()
+ */
+static int sfe_ipv4_debug_dev_release(struct inode *inode, struct file *file)
+{
+	struct sfe_ipv4_debug_xml_write_state *ws;
+
+	ws = (struct sfe_ipv4_debug_xml_write_state *)file->private_data;
+	if (ws) {
+		/*
+		 * We've finished with our output so free the write state.
+		 */
+		kfree(ws);
+	}
+
+	return 0;
+}
+
+/*
+ * File operations used in the debug char device
+ */
+static struct file_operations sfe_ipv4_debug_dev_fops = {
+	.read = sfe_ipv4_debug_dev_read,
+	.write = sfe_ipv4_debug_dev_write,
+	.open = sfe_ipv4_debug_dev_open,
+	.release = sfe_ipv4_debug_dev_release
+};
+
+#ifdef CONFIG_NF_FLOW_COOKIE
+/*
+ * sfe_register_flow_cookie_cb
+ *	Register a function that SFE can use to configure the flow cookie for a flow.
+ *
+ * Hardware drivers which support flow cookies should register a callback
+ * function with SFE; SFE then uses that function to configure the flow
+ * cookie for each flow.
+ * return: 0, success; !=0, fail
+ */
+int sfe_register_flow_cookie_cb(flow_cookie_set_func_t cb)
+{
+	struct sfe_ipv4 *si = &__si;
+
+	BUG_ON(!cb);
+
+	if (si->flow_cookie_set_func) {
+		return -1;
+	}
+
+	rcu_assign_pointer(si->flow_cookie_set_func, cb);
+	return 0;
+}
+
+/*
+ * sfe_unregister_flow_cookie_cb
+ *	Unregister the function used to configure the flow cookie for a flow.
+ *
+ * return: 0, success; !=0, fail
+ */
+int sfe_unregister_flow_cookie_cb(flow_cookie_set_func_t cb)
+{
+	struct sfe_ipv4 *si = &__si;
+
+	RCU_INIT_POINTER(si->flow_cookie_set_func, NULL);
+	return 0;
+}
+
+/*
+ * sfe_ipv4_get_flow_cookie()
+ */
+static ssize_t sfe_ipv4_get_flow_cookie(struct device *dev,
+					struct device_attribute *attr,
+					char *buf)
+{
+	struct sfe_ipv4 *si = &__si;
+	return snprintf(buf, (ssize_t)PAGE_SIZE, "%d\n", si->flow_cookie_enable);
+}
+
+/*
+ * sfe_ipv4_set_flow_cookie()
+ */
+static ssize_t sfe_ipv4_set_flow_cookie(struct device *dev,
+					struct device_attribute *attr,
+					const char *buf, size_t size)
+{
+	struct sfe_ipv4 *si = &__si;
+	strict_strtol(buf, 0, (long int *)&si->flow_cookie_enable);
+
+	return size;
+}
+
+/*
+ * sysfs attributes.
+ */
+static const struct device_attribute sfe_ipv4_flow_cookie_attr =
+	__ATTR(flow_cookie_enable, S_IWUSR | S_IRUGO, sfe_ipv4_get_flow_cookie, sfe_ipv4_set_flow_cookie);
+#endif /*CONFIG_NF_FLOW_COOKIE*/
+
+/*
+ * sfe_ipv4_init()
+ */
+static int __init sfe_ipv4_init(void)
+{
+	struct sfe_ipv4 *si = &__si;
+	int result = -1;
+
+	DEBUG_INFO("SFE IPv4 init\n");
+
+	/*
+	 * Create sys/sfe_ipv4
+	 */
+	si->sys_sfe_ipv4 = kobject_create_and_add("sfe_ipv4", NULL);
+	if (!si->sys_sfe_ipv4) {
+		DEBUG_ERROR("failed to register sfe_ipv4\n");
+		goto exit1;
+	}
+
+	/*
+	 * Create files, one for each parameter supported by this module.
+	 */
+	result = sysfs_create_file(si->sys_sfe_ipv4, &sfe_ipv4_debug_dev_attr.attr);
+	if (result) {
+		DEBUG_ERROR("failed to register debug dev file: %d\n", result);
+		goto exit2;
+	}
+
+#ifdef CONFIG_NF_FLOW_COOKIE
+	result = sysfs_create_file(si->sys_sfe_ipv4, &sfe_ipv4_flow_cookie_attr.attr);
+	if (result) {
+		DEBUG_ERROR("failed to register flow cookie enable file: %d\n", result);
+		goto exit3;
+	}
+#endif /* CONFIG_NF_FLOW_COOKIE */
+
+	/*
+	 * Register our debug char device.
+	 */
+	result = register_chrdev(0, "sfe_ipv4", &sfe_ipv4_debug_dev_fops);
+	if (result < 0) {
+		DEBUG_ERROR("Failed to register chrdev: %d\n", result);
+		goto exit4;
+	}
+
+	si->debug_dev = result;
+
+	/*
+	 * Create a timer to handle periodic statistics.
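+	 * The period below is ((HZ + 99) / 100) jiffies, i.e. ceil(HZ / 100)
+	 * or roughly 10ms: 1 jiffy with HZ = 100, 3 jiffies (12ms) with
+	 * HZ = 250.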
+	 */
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 15, 0))
+	setup_timer(&si->timer, sfe_ipv4_periodic_sync, (unsigned long)si);
+#else
+	timer_setup(&si->timer, sfe_ipv4_periodic_sync, 0);
+#endif
+	mod_timer(&si->timer, jiffies + ((HZ + 99) / 100));
+
+	spin_lock_init(&si->lock);
+
+	return 0;
+
+exit4:
+#ifdef CONFIG_NF_FLOW_COOKIE
+	sysfs_remove_file(si->sys_sfe_ipv4, &sfe_ipv4_flow_cookie_attr.attr);
+
+exit3:
+#endif /* CONFIG_NF_FLOW_COOKIE */
+	sysfs_remove_file(si->sys_sfe_ipv4, &sfe_ipv4_debug_dev_attr.attr);
+
+exit2:
+	kobject_put(si->sys_sfe_ipv4);
+
+exit1:
+	return result;
+}
+
+/*
+ * sfe_ipv4_exit()
+ */
+static void __exit sfe_ipv4_exit(void)
+{
+	struct sfe_ipv4 *si = &__si;
+
+	DEBUG_INFO("SFE IPv4 exit\n");
+
+	/*
+	 * Destroy all connections.
+	 */
+	sfe_ipv4_destroy_all_rules_for_dev(NULL);
+
+	del_timer_sync(&si->timer);
+
+	unregister_chrdev(si->debug_dev, "sfe_ipv4");
+
+#ifdef CONFIG_NF_FLOW_COOKIE
+	sysfs_remove_file(si->sys_sfe_ipv4, &sfe_ipv4_flow_cookie_attr.attr);
+#endif /* CONFIG_NF_FLOW_COOKIE */
+	sysfs_remove_file(si->sys_sfe_ipv4, &sfe_ipv4_debug_dev_attr.attr);
+
+	kobject_put(si->sys_sfe_ipv4);
+
+}
+
+module_init(sfe_ipv4_init)
+module_exit(sfe_ipv4_exit)
+
+EXPORT_SYMBOL(sfe_ipv4_recv);
+EXPORT_SYMBOL(sfe_ipv4_create_rule);
+EXPORT_SYMBOL(sfe_ipv4_destroy_rule);
+EXPORT_SYMBOL(sfe_ipv4_destroy_all_rules_for_dev);
+EXPORT_SYMBOL(sfe_ipv4_register_sync_rule_callback);
+EXPORT_SYMBOL(sfe_ipv4_mark_rule);
+EXPORT_SYMBOL(sfe_ipv4_update_rule);
+#ifdef CONFIG_NF_FLOW_COOKIE
+EXPORT_SYMBOL(sfe_register_flow_cookie_cb);
+EXPORT_SYMBOL(sfe_unregister_flow_cookie_cb);
+#endif
+
+MODULE_DESCRIPTION("Shortcut Forwarding Engine - IPv4 edition");
+MODULE_LICENSE("Dual BSD/GPL");
+
diff --git a/shortcut-fe/shortcut-fe/src/sfe_ipv6.c b/shortcut-fe/shortcut-fe/src/sfe_ipv6.c
new file mode 100644
index 000000000..ae3306693
--- /dev/null
+++ b/shortcut-fe/shortcut-fe/src/sfe_ipv6.c
@@ -0,0 +1,3625 @@
+/*
+ * sfe_ipv6.c
+ *	Shortcut forwarding engine - IPv6 support.
+ *
+ * Copyright (c) 2015-2016, 2019-2020 The Linux Foundation. All rights reserved.
+ * Permission to use, copy, modify, and/or distribute this software for
+ * any purpose with or without fee is hereby granted, provided that the
+ * above copyright notice and this permission notice appear in all copies.
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
+ * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/sysfs.h>
+#include <linux/skbuff.h>
+#include <linux/icmpv6.h>
+#include <net/tcp.h>
+#include <linux/etherdevice.h>
+#include <linux/version.h>
+
+#include "sfe.h"
+#include "sfe_cm.h"
+
+/*
+ * By default Linux IP header and transport layer header structures are
+ * unpacked, assuming that such headers should be 32-bit aligned.
+ * Unfortunately some wireless adaptors can't cope with this requirement and
+ * some CPUs can't handle misaligned accesses. For those platforms we
+ * define SFE_IPV6_UNALIGNED_IP_HEADER and mark the structures as packed.
+ * When we do this the compiler will generate slightly worse code than for the
+ * aligned case (on most platforms) but will be much quicker than fixing
+ * things up in an unaligned trap handler.
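+ * (On a strict-alignment CPU a 32-bit load from a 2-byte-aligned IPv6
+ * address field would otherwise fault and be fixed up, at great cost,
+ * in the kernel's unaligned-access exception handler.)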
+ */
+#define SFE_IPV6_UNALIGNED_IP_HEADER 1
+#if SFE_IPV6_UNALIGNED_IP_HEADER
+#define SFE_IPV6_UNALIGNED_STRUCT __attribute__((packed))
+#else
+#define SFE_IPV6_UNALIGNED_STRUCT
+#endif
+
+#define CHAR_DEV_MSG_SIZE 768
+
+/*
+ * An Ethernet header, but with an optional "packed" attribute to
+ * help with performance on some platforms (see the definition of
+ * SFE_IPV6_UNALIGNED_STRUCT)
+ */
+struct sfe_ipv6_eth_hdr {
+	__be16 h_dest[ETH_ALEN / 2];
+	__be16 h_source[ETH_ALEN / 2];
+	__be16 h_proto;
+} SFE_IPV6_UNALIGNED_STRUCT;
+
+#define SFE_IPV6_DSCP_MASK 0xf03f
+#define SFE_IPV6_DSCP_SHIFT 2
+
+/*
+ * An IPv6 header, but with an optional "packed" attribute to
+ * help with performance on some platforms (see the definition of
+ * SFE_IPV6_UNALIGNED_STRUCT)
+ */
+struct sfe_ipv6_ip_hdr {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	__u8 priority:4,
+	     version:4;
+#elif defined(__BIG_ENDIAN_BITFIELD)
+	__u8 version:4,
+	     priority:4;
+#else
+#error "Please fix <asm/byteorder.h>"
+#endif
+	__u8 flow_lbl[3];
+	__be16 payload_len;
+	__u8 nexthdr;
+	__u8 hop_limit;
+	struct sfe_ipv6_addr saddr;
+	struct sfe_ipv6_addr daddr;
+
+	/*
+	 * The extension headers start here.
+	 */
+} SFE_IPV6_UNALIGNED_STRUCT;
+
+#define SFE_IPV6_EXT_HDR_HOP 0
+#define SFE_IPV6_EXT_HDR_ROUTING 43
+#define SFE_IPV6_EXT_HDR_FRAG 44
+#define SFE_IPV6_EXT_HDR_ESP 50
+#define SFE_IPV6_EXT_HDR_AH 51
+#define SFE_IPV6_EXT_HDR_NONE 59
+#define SFE_IPV6_EXT_HDR_DST 60
+#define SFE_IPV6_EXT_HDR_MH 135
+
+/*
+ * fragmentation header
+ */
+
+struct sfe_ipv6_frag_hdr {
+	__u8 nexthdr;
+	__u8 reserved;
+	__be16 frag_off;
+	__be32 identification;
+};
+
+#define SFE_IPV6_FRAG_OFFSET 0xfff8
+
+/*
+ * generic IPv6 extension header
+ */
+struct sfe_ipv6_ext_hdr {
+	__u8 next_hdr;
+	__u8 hdr_len;
+	__u8 padding[6];
+} SFE_IPV6_UNALIGNED_STRUCT;
+
+/*
+ * A UDP header, but with an optional "packed" attribute to
+ * help with performance on some platforms (see the definition of
+ * SFE_IPV6_UNALIGNED_STRUCT)
+ */
+struct sfe_ipv6_udp_hdr {
+	__be16 source;
+	__be16 dest;
+	__be16 len;
+	__sum16 check;
+} SFE_IPV6_UNALIGNED_STRUCT;
+
+/*
+ * A TCP header, but with an optional "packed" attribute to
+ * help with performance on some platforms (see the definition of
+ * SFE_IPV6_UNALIGNED_STRUCT)
+ */
+struct sfe_ipv6_tcp_hdr {
+	__be16 source;
+	__be16 dest;
+	__be32 seq;
+	__be32 ack_seq;
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	__u16 res1:4,
+	      doff:4,
+	      fin:1,
+	      syn:1,
+	      rst:1,
+	      psh:1,
+	      ack:1,
+	      urg:1,
+	      ece:1,
+	      cwr:1;
+#elif defined(__BIG_ENDIAN_BITFIELD)
+	__u16 doff:4,
+	      res1:4,
+	      cwr:1,
+	      ece:1,
+	      urg:1,
+	      ack:1,
+	      psh:1,
+	      rst:1,
+	      syn:1,
+	      fin:1;
+#else
+#error "Adjust your <asm/byteorder.h> defines"
+#endif
+	__be16 window;
+	__sum16 check;
+	__be16 urg_ptr;
+} SFE_IPV6_UNALIGNED_STRUCT;
+
+/*
+ * Specifies the lower bound on ACK numbers carried in the TCP header
+ */
+#define SFE_IPV6_TCP_MAX_ACK_WINDOW 65520
+
+/*
+ * IPv6 TCP connection match additional data.
+ */
+struct sfe_ipv6_tcp_connection_match {
+	u8 win_scale;		/* Window scale */
+	u32 max_win;		/* Maximum window size seen */
+	u32 end;		/* Sequence number of the next byte to send (seq + segment length) */
+	u32 max_end;		/* Sequence number of the last byte to ack */
+};
+
+/*
+ * Bit flags for IPv6 connection matching entry.
+ */ +#define SFE_IPV6_CONNECTION_MATCH_FLAG_XLATE_SRC (1<<0) + /* Perform source translation */ +#define SFE_IPV6_CONNECTION_MATCH_FLAG_XLATE_DEST (1<<1) + /* Perform destination translation */ +#define SFE_IPV6_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK (1<<2) + /* Ignore TCP sequence numbers */ +#define SFE_IPV6_CONNECTION_MATCH_FLAG_WRITE_FAST_ETH_HDR (1<<3) + /* Fast Ethernet header write */ +#define SFE_IPV6_CONNECTION_MATCH_FLAG_WRITE_L2_HDR (1<<4) + /* Fast Ethernet header write */ +#define SFE_IPV6_CONNECTION_MATCH_FLAG_PRIORITY_REMARK (1<<5) + /* remark priority of SKB */ +#define SFE_IPV6_CONNECTION_MATCH_FLAG_DSCP_REMARK (1<<6) + /* remark DSCP of packet */ + +/* + * IPv6 connection matching structure. + */ +struct sfe_ipv6_connection_match { + /* + * References to other objects. + */ + struct sfe_ipv6_connection_match *next; + struct sfe_ipv6_connection_match *prev; + struct sfe_ipv6_connection *connection; + struct sfe_ipv6_connection_match *counter_match; + /* Matches the flow in the opposite direction as the one in connection */ + struct sfe_ipv6_connection_match *active_next; + struct sfe_ipv6_connection_match *active_prev; + bool active; /* Flag to indicate if we're on the active list */ + + /* + * Characteristics that identify flows that match this rule. + */ + struct net_device *match_dev; /* Network device */ + u8 match_protocol; /* Protocol */ + struct sfe_ipv6_addr match_src_ip[1]; /* Source IP address */ + struct sfe_ipv6_addr match_dest_ip[1]; /* Destination IP address */ + __be16 match_src_port; /* Source port/connection ident */ + __be16 match_dest_port; /* Destination port/connection ident */ + + /* + * Control the operations of the match. + */ + u32 flags; /* Bit flags */ +#ifdef CONFIG_NF_FLOW_COOKIE + u32 flow_cookie; /* used flow cookie, for debug */ +#endif +#ifdef CONFIG_XFRM + u32 flow_accel; /* The flow accelerated or not */ +#endif + + /* + * Connection state that we track once we match. + */ + union { /* Protocol-specific state */ + struct sfe_ipv6_tcp_connection_match tcp; + } protocol_state; + /* + * Stats recorded in a sync period. These stats will be added to + * rx_packet_count64/rx_byte_count64 after a sync period. + */ + u32 rx_packet_count; + u32 rx_byte_count; + + /* + * Packet translation information. + */ + struct sfe_ipv6_addr xlate_src_ip[1]; /* Address after source translation */ + __be16 xlate_src_port; /* Port/connection ident after source translation */ + u16 xlate_src_csum_adjustment; + /* Transport layer checksum adjustment after source translation */ + struct sfe_ipv6_addr xlate_dest_ip[1]; /* Address after destination translation */ + __be16 xlate_dest_port; /* Port/connection ident after destination translation */ + u16 xlate_dest_csum_adjustment; + /* Transport layer checksum adjustment after destination translation */ + + /* + * QoS information + */ + u32 priority; + u32 dscp; + + /* + * Packet transmit information. + */ + struct net_device *xmit_dev; /* Network device on which to transmit */ + unsigned short int xmit_dev_mtu; + /* Interface MTU */ + u16 xmit_dest_mac[ETH_ALEN / 2]; + /* Destination MAC address to use when forwarding */ + u16 xmit_src_mac[ETH_ALEN / 2]; + /* Source MAC address to use when forwarding */ + + /* + * Summary stats. + */ + u64 rx_packet_count64; + u64 rx_byte_count64; +}; + +/* + * Per-connection data structure. 
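+ *
+ * Each connection owns two sfe_ipv6_connection_match entries, one for the
+ * original direction and one for the reply direction.  Both are inserted
+ * into the match hash so a packet arriving in either direction finds its
+ * own match entry with a single lookup.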
+ */ +struct sfe_ipv6_connection { + struct sfe_ipv6_connection *next; + /* Pointer to the next entry in a hash chain */ + struct sfe_ipv6_connection *prev; + /* Pointer to the previous entry in a hash chain */ + int protocol; /* IP protocol number */ + struct sfe_ipv6_addr src_ip[1]; /* Src IP addr pre-translation */ + struct sfe_ipv6_addr src_ip_xlate[1]; /* Src IP addr post-translation */ + struct sfe_ipv6_addr dest_ip[1]; /* Dest IP addr pre-translation */ + struct sfe_ipv6_addr dest_ip_xlate[1]; /* Dest IP addr post-translation */ + __be16 src_port; /* Src port pre-translation */ + __be16 src_port_xlate; /* Src port post-translation */ + __be16 dest_port; /* Dest port pre-translation */ + __be16 dest_port_xlate; /* Dest port post-translation */ + struct sfe_ipv6_connection_match *original_match; + /* Original direction matching structure */ + struct net_device *original_dev; + /* Original direction source device */ + struct sfe_ipv6_connection_match *reply_match; + /* Reply direction matching structure */ + struct net_device *reply_dev; /* Reply direction source device */ + u64 last_sync_jiffies; /* Jiffies count for the last sync */ + struct sfe_ipv6_connection *all_connections_next; + /* Pointer to the next entry in the list of all connections */ + struct sfe_ipv6_connection *all_connections_prev; + /* Pointer to the previous entry in the list of all connections */ + u32 mark; /* mark for outgoing packet */ + u32 debug_read_seq; /* sequence number for debug dump */ +}; + +/* + * IPv6 connections and hash table size information. + */ +#define SFE_IPV6_CONNECTION_HASH_SHIFT 12 +#define SFE_IPV6_CONNECTION_HASH_SIZE (1 << SFE_IPV6_CONNECTION_HASH_SHIFT) +#define SFE_IPV6_CONNECTION_HASH_MASK (SFE_IPV6_CONNECTION_HASH_SIZE - 1) + +#ifdef CONFIG_NF_FLOW_COOKIE +#define SFE_FLOW_COOKIE_SIZE 2048 +#define SFE_FLOW_COOKIE_MASK 0x7ff + +struct sfe_ipv6_flow_cookie_entry { + struct sfe_ipv6_connection_match *match; + unsigned long last_clean_time; +}; +#endif + +enum sfe_ipv6_exception_events { + SFE_IPV6_EXCEPTION_EVENT_UDP_HEADER_INCOMPLETE, + SFE_IPV6_EXCEPTION_EVENT_UDP_NO_CONNECTION, + SFE_IPV6_EXCEPTION_EVENT_UDP_IP_OPTIONS_OR_INITIAL_FRAGMENT, + SFE_IPV6_EXCEPTION_EVENT_UDP_SMALL_TTL, + SFE_IPV6_EXCEPTION_EVENT_UDP_NEEDS_FRAGMENTATION, + SFE_IPV6_EXCEPTION_EVENT_TCP_HEADER_INCOMPLETE, + SFE_IPV6_EXCEPTION_EVENT_TCP_NO_CONNECTION_SLOW_FLAGS, + SFE_IPV6_EXCEPTION_EVENT_TCP_NO_CONNECTION_FAST_FLAGS, + SFE_IPV6_EXCEPTION_EVENT_TCP_IP_OPTIONS_OR_INITIAL_FRAGMENT, + SFE_IPV6_EXCEPTION_EVENT_TCP_SMALL_TTL, + SFE_IPV6_EXCEPTION_EVENT_TCP_NEEDS_FRAGMENTATION, + SFE_IPV6_EXCEPTION_EVENT_TCP_FLAGS, + SFE_IPV6_EXCEPTION_EVENT_TCP_SEQ_EXCEEDS_RIGHT_EDGE, + SFE_IPV6_EXCEPTION_EVENT_TCP_SMALL_DATA_OFFS, + SFE_IPV6_EXCEPTION_EVENT_TCP_BAD_SACK, + SFE_IPV6_EXCEPTION_EVENT_TCP_BIG_DATA_OFFS, + SFE_IPV6_EXCEPTION_EVENT_TCP_SEQ_BEFORE_LEFT_EDGE, + SFE_IPV6_EXCEPTION_EVENT_TCP_ACK_EXCEEDS_RIGHT_EDGE, + SFE_IPV6_EXCEPTION_EVENT_TCP_ACK_BEFORE_LEFT_EDGE, + SFE_IPV6_EXCEPTION_EVENT_ICMP_HEADER_INCOMPLETE, + SFE_IPV6_EXCEPTION_EVENT_ICMP_UNHANDLED_TYPE, + SFE_IPV6_EXCEPTION_EVENT_ICMP_IPV6_HEADER_INCOMPLETE, + SFE_IPV6_EXCEPTION_EVENT_ICMP_IPV6_NON_V6, + SFE_IPV6_EXCEPTION_EVENT_ICMP_IPV6_IP_OPTIONS_INCOMPLETE, + SFE_IPV6_EXCEPTION_EVENT_ICMP_IPV6_UDP_HEADER_INCOMPLETE, + SFE_IPV6_EXCEPTION_EVENT_ICMP_IPV6_TCP_HEADER_INCOMPLETE, + SFE_IPV6_EXCEPTION_EVENT_ICMP_IPV6_UNHANDLED_PROTOCOL, + SFE_IPV6_EXCEPTION_EVENT_ICMP_NO_CONNECTION, + SFE_IPV6_EXCEPTION_EVENT_ICMP_FLUSHED_CONNECTION, + 
SFE_IPV6_EXCEPTION_EVENT_HEADER_INCOMPLETE, + SFE_IPV6_EXCEPTION_EVENT_BAD_TOTAL_LENGTH, + SFE_IPV6_EXCEPTION_EVENT_NON_V6, + SFE_IPV6_EXCEPTION_EVENT_NON_INITIAL_FRAGMENT, + SFE_IPV6_EXCEPTION_EVENT_DATAGRAM_INCOMPLETE, + SFE_IPV6_EXCEPTION_EVENT_IP_OPTIONS_INCOMPLETE, + SFE_IPV6_EXCEPTION_EVENT_UNHANDLED_PROTOCOL, + SFE_IPV6_EXCEPTION_EVENT_FLOW_COOKIE_ADD_FAIL, + SFE_IPV6_EXCEPTION_EVENT_CLONED_SKB_UNSHARE_ERROR, + SFE_IPV6_EXCEPTION_EVENT_LAST +}; + +static char *sfe_ipv6_exception_events_string[SFE_IPV6_EXCEPTION_EVENT_LAST] = { + "UDP_HEADER_INCOMPLETE", + "UDP_NO_CONNECTION", + "UDP_IP_OPTIONS_OR_INITIAL_FRAGMENT", + "UDP_SMALL_TTL", + "UDP_NEEDS_FRAGMENTATION", + "TCP_HEADER_INCOMPLETE", + "TCP_NO_CONNECTION_SLOW_FLAGS", + "TCP_NO_CONNECTION_FAST_FLAGS", + "TCP_IP_OPTIONS_OR_INITIAL_FRAGMENT", + "TCP_SMALL_TTL", + "TCP_NEEDS_FRAGMENTATION", + "TCP_FLAGS", + "TCP_SEQ_EXCEEDS_RIGHT_EDGE", + "TCP_SMALL_DATA_OFFS", + "TCP_BAD_SACK", + "TCP_BIG_DATA_OFFS", + "TCP_SEQ_BEFORE_LEFT_EDGE", + "TCP_ACK_EXCEEDS_RIGHT_EDGE", + "TCP_ACK_BEFORE_LEFT_EDGE", + "ICMP_HEADER_INCOMPLETE", + "ICMP_UNHANDLED_TYPE", + "ICMP_IPV6_HEADER_INCOMPLETE", + "ICMP_IPV6_NON_V6", + "ICMP_IPV6_IP_OPTIONS_INCOMPLETE", + "ICMP_IPV6_UDP_HEADER_INCOMPLETE", + "ICMP_IPV6_TCP_HEADER_INCOMPLETE", + "ICMP_IPV6_UNHANDLED_PROTOCOL", + "ICMP_NO_CONNECTION", + "ICMP_FLUSHED_CONNECTION", + "HEADER_INCOMPLETE", + "BAD_TOTAL_LENGTH", + "NON_V6", + "NON_INITIAL_FRAGMENT", + "DATAGRAM_INCOMPLETE", + "IP_OPTIONS_INCOMPLETE", + "UNHANDLED_PROTOCOL", + "FLOW_COOKIE_ADD_FAIL", + "CLONED_SKB_UNSHARE_ERROR" +}; + +/* + * Per-module structure. + */ +struct sfe_ipv6 { + spinlock_t lock; /* Lock for SMP correctness */ + struct sfe_ipv6_connection_match *active_head; + /* Head of the list of recently active connections */ + struct sfe_ipv6_connection_match *active_tail; + /* Tail of the list of recently active connections */ + struct sfe_ipv6_connection *all_connections_head; + /* Head of the list of all connections */ + struct sfe_ipv6_connection *all_connections_tail; + /* Tail of the list of all connections */ + unsigned int num_connections; /* Number of connections */ + struct timer_list timer; /* Timer used for periodic sync ops */ + sfe_sync_rule_callback_t __rcu sync_rule_callback; + /* Callback function registered by a connection manager for stats syncing */ + struct sfe_ipv6_connection *conn_hash[SFE_IPV6_CONNECTION_HASH_SIZE]; + /* Connection hash table */ + struct sfe_ipv6_connection_match *conn_match_hash[SFE_IPV6_CONNECTION_HASH_SIZE]; + /* Connection match hash table */ +#ifdef CONFIG_NF_FLOW_COOKIE + struct sfe_ipv6_flow_cookie_entry sfe_flow_cookie_table[SFE_FLOW_COOKIE_SIZE]; + /* flow cookie table*/ + sfe_ipv6_flow_cookie_set_func_t flow_cookie_set_func; + /* function used to configure flow cookie in hardware*/ + int flow_cookie_enable; + /* Enable/disable flow cookie at runtime */ +#endif + + /* + * Stats recorded in a sync period. These stats will be added to + * connection_xxx64 after a sync period. 
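+	 *
+	 * Each of these is a 32-bit delta that sfe_ipv6_update_summary_stats()
+	 * folds into the corresponding 64-bit total and then clears, e.g.:
+	 *
+	 *	si->connection_flushes64 += si->connection_flushes;
+	 *	si->connection_flushes = 0;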
+ */ + u32 connection_create_requests; + /* Number of IPv6 connection create requests */ + u32 connection_create_collisions; + /* Number of IPv6 connection create requests that collided with existing hash table entries */ + u32 connection_destroy_requests; + /* Number of IPv6 connection destroy requests */ + u32 connection_destroy_misses; + /* Number of IPv6 connection destroy requests that missed our hash table */ + u32 connection_match_hash_hits; + /* Number of IPv6 connection match hash hits */ + u32 connection_match_hash_reorders; + /* Number of IPv6 connection match hash reorders */ + u32 connection_flushes; /* Number of IPv6 connection flushes */ + u32 packets_forwarded; /* Number of IPv6 packets forwarded */ + u32 packets_not_forwarded; /* Number of IPv6 packets not forwarded */ + u32 exception_events[SFE_IPV6_EXCEPTION_EVENT_LAST]; + + /* + * Summary statistics. + */ + u64 connection_create_requests64; + /* Number of IPv6 connection create requests */ + u64 connection_create_collisions64; + /* Number of IPv6 connection create requests that collided with existing hash table entries */ + u64 connection_destroy_requests64; + /* Number of IPv6 connection destroy requests */ + u64 connection_destroy_misses64; + /* Number of IPv6 connection destroy requests that missed our hash table */ + u64 connection_match_hash_hits64; + /* Number of IPv6 connection match hash hits */ + u64 connection_match_hash_reorders64; + /* Number of IPv6 connection match hash reorders */ + u64 connection_flushes64; /* Number of IPv6 connection flushes */ + u64 packets_forwarded64; /* Number of IPv6 packets forwarded */ + u64 packets_not_forwarded64; + /* Number of IPv6 packets not forwarded */ + u64 exception_events64[SFE_IPV6_EXCEPTION_EVENT_LAST]; + + /* + * Control state. + */ + struct kobject *sys_sfe_ipv6; /* sysfs linkage */ + int debug_dev; /* Major number of the debug char device */ + u32 debug_read_seq; /* sequence number for debug dump */ +}; + +/* + * Enumeration of the XML output. + */ +enum sfe_ipv6_debug_xml_states { + SFE_IPV6_DEBUG_XML_STATE_START, + SFE_IPV6_DEBUG_XML_STATE_CONNECTIONS_START, + SFE_IPV6_DEBUG_XML_STATE_CONNECTIONS_CONNECTION, + SFE_IPV6_DEBUG_XML_STATE_CONNECTIONS_END, + SFE_IPV6_DEBUG_XML_STATE_EXCEPTIONS_START, + SFE_IPV6_DEBUG_XML_STATE_EXCEPTIONS_EXCEPTION, + SFE_IPV6_DEBUG_XML_STATE_EXCEPTIONS_END, + SFE_IPV6_DEBUG_XML_STATE_STATS, + SFE_IPV6_DEBUG_XML_STATE_END, + SFE_IPV6_DEBUG_XML_STATE_DONE +}; + +/* + * XML write state. + */ +struct sfe_ipv6_debug_xml_write_state { + enum sfe_ipv6_debug_xml_states state; + /* XML output file state machine state */ + int iter_exception; /* Next exception iterator */ +}; + +typedef bool (*sfe_ipv6_debug_xml_write_method_t)(struct sfe_ipv6 *si, char *buffer, char *msg, size_t *length, + int *total_read, struct sfe_ipv6_debug_xml_write_state *ws); + +static struct sfe_ipv6 __si6; + +/* + * sfe_ipv6_get_debug_dev() + */ +static ssize_t sfe_ipv6_get_debug_dev(struct device *dev, struct device_attribute *attr, char *buf); + +/* + * sysfs attributes. 
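+ *
+ * The debug_dev attribute below reports the major number of the debug
+ * character device.  A typical user-space session (assuming the kobject
+ * is registered as "sfe_ipv6") would be:
+ *
+ *	mknod /dev/sfe_ipv6 c $(cat /sys/sfe_ipv6/debug_dev) 0
+ *	cat /dev/sfe_ipv6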
+ */ +static const struct device_attribute sfe_ipv6_debug_dev_attr = + __ATTR(debug_dev, S_IWUSR | S_IRUGO, sfe_ipv6_get_debug_dev, NULL); + +/* + * sfe_ipv6_is_ext_hdr() + * check if we recognize ipv6 extension header + */ +static inline bool sfe_ipv6_is_ext_hdr(u8 hdr) +{ + return (hdr == SFE_IPV6_EXT_HDR_HOP) || + (hdr == SFE_IPV6_EXT_HDR_ROUTING) || + (hdr == SFE_IPV6_EXT_HDR_FRAG) || + (hdr == SFE_IPV6_EXT_HDR_AH) || + (hdr == SFE_IPV6_EXT_HDR_DST) || + (hdr == SFE_IPV6_EXT_HDR_MH); +} + +/* + * sfe_ipv6_change_dsfield() + * change dscp field in IPv6 packet + */ +static inline void sfe_ipv6_change_dsfield(struct sfe_ipv6_ip_hdr *iph, u8 dscp) +{ + __be16 *p = (__be16 *)iph; + + *p = ((*p & htons(SFE_IPV6_DSCP_MASK)) | htons((u16)dscp << 4)); +} + +/* + * sfe_ipv6_get_connection_match_hash() + * Generate the hash used in connection match lookups. + */ +static inline unsigned int sfe_ipv6_get_connection_match_hash(struct net_device *dev, u8 protocol, + struct sfe_ipv6_addr *src_ip, __be16 src_port, + struct sfe_ipv6_addr *dest_ip, __be16 dest_port) +{ + u32 idx, hash = 0; + size_t dev_addr = (size_t)dev; + + for (idx = 0; idx < 4; idx++) { + hash ^= src_ip->addr[idx] ^ dest_ip->addr[idx]; + } + hash = ((u32)dev_addr) ^ hash ^ protocol ^ ntohs(src_port ^ dest_port); + return ((hash >> SFE_IPV6_CONNECTION_HASH_SHIFT) ^ hash) & SFE_IPV6_CONNECTION_HASH_MASK; +} + +/* + * sfe_ipv6_find_connection_match() + * Get the IPv6 flow match info that corresponds to a particular 5-tuple. + * + * On entry we must be holding the lock that protects the hash table. + */ +static struct sfe_ipv6_connection_match * +sfe_ipv6_find_connection_match(struct sfe_ipv6 *si, struct net_device *dev, u8 protocol, + struct sfe_ipv6_addr *src_ip, __be16 src_port, + struct sfe_ipv6_addr *dest_ip, __be16 dest_port) +{ + struct sfe_ipv6_connection_match *cm; + struct sfe_ipv6_connection_match *head; + unsigned int conn_match_idx; + + conn_match_idx = sfe_ipv6_get_connection_match_hash(dev, protocol, src_ip, src_port, dest_ip, dest_port); + cm = si->conn_match_hash[conn_match_idx]; + + /* + * If we don't have anything in this chain then bail. + */ + if (unlikely(!cm)) { + return NULL; + } + + /* + * Hopefully the first entry is the one we want. + */ + if ((cm->match_src_port == src_port) + && (cm->match_dest_port == dest_port) + && (sfe_ipv6_addr_equal(cm->match_src_ip, src_ip)) + && (sfe_ipv6_addr_equal(cm->match_dest_ip, dest_ip)) + && (cm->match_protocol == protocol) + && (cm->match_dev == dev)) { + si->connection_match_hash_hits++; + return cm; + } + + /* + * Unfortunately we didn't find it at head, so we search it in chain and + * move matching entry to the top of the hash chain. We presume that this + * will be reused again very quickly. + */ + head = cm; + do { + cm = cm->next; + } while (cm && (cm->match_src_port != src_port + || cm->match_dest_port != dest_port + || !sfe_ipv6_addr_equal(cm->match_src_ip, src_ip) + || !sfe_ipv6_addr_equal(cm->match_dest_ip, dest_ip) + || cm->match_protocol != protocol + || cm->match_dev != dev)); + + /* + * Not found then we're done. + */ + if (unlikely(!cm)) { + return NULL; + } + + /* + * We found a match so move it. + */ + if (cm->next) { + cm->next->prev = cm->prev; + } + cm->prev->next = cm->next; + cm->prev = NULL; + cm->next = head; + head->prev = cm; + si->conn_match_hash[conn_match_idx] = cm; + si->connection_match_hash_reorders++; + + return cm; +} + +/* + * sfe_ipv6_connection_match_update_summary_stats() + * Update the summary stats for a connection match entry. 
+ */ +static inline void sfe_ipv6_connection_match_update_summary_stats(struct sfe_ipv6_connection_match *cm) +{ + cm->rx_packet_count64 += cm->rx_packet_count; + cm->rx_packet_count = 0; + cm->rx_byte_count64 += cm->rx_byte_count; + cm->rx_byte_count = 0; +} + +/* + * sfe_ipv6_connection_match_compute_translations() + * Compute port and address translations for a connection match entry. + */ +static void sfe_ipv6_connection_match_compute_translations(struct sfe_ipv6_connection_match *cm) +{ + u32 diff[9]; + u32 *idx_32; + u16 *idx_16; + + /* + * Before we insert the entry look to see if this is tagged as doing address + * translations. If it is then work out the adjustment that we need to apply + * to the transport checksum. + */ + if (cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_XLATE_SRC) { + u32 adj = 0; + u32 carry = 0; + + /* + * Precompute an incremental checksum adjustment so we can + * edit packets in this stream very quickly. The algorithm is from RFC1624. + */ + idx_32 = diff; + *(idx_32++) = cm->match_src_ip->addr[0]; + *(idx_32++) = cm->match_src_ip->addr[1]; + *(idx_32++) = cm->match_src_ip->addr[2]; + *(idx_32++) = cm->match_src_ip->addr[3]; + + idx_16 = (u16 *)idx_32; + *(idx_16++) = cm->match_src_port; + *(idx_16++) = ~cm->xlate_src_port; + idx_32 = (u32 *)idx_16; + + *(idx_32++) = ~cm->xlate_src_ip->addr[0]; + *(idx_32++) = ~cm->xlate_src_ip->addr[1]; + *(idx_32++) = ~cm->xlate_src_ip->addr[2]; + *(idx_32++) = ~cm->xlate_src_ip->addr[3]; + + /* + * When we compute this fold it down to a 16-bit offset + * as that way we can avoid having to do a double + * folding of the twos-complement result because the + * addition of 2 16-bit values cannot cause a double + * wrap-around! + */ + for (idx_32 = diff; idx_32 < diff + 9; idx_32++) { + u32 w = *idx_32; + adj += carry; + adj += w; + carry = (w > adj); + } + adj += carry; + adj = (adj & 0xffff) + (adj >> 16); + adj = (adj & 0xffff) + (adj >> 16); + cm->xlate_src_csum_adjustment = (u16)adj; + } + + if (cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_XLATE_DEST) { + u32 adj = 0; + u32 carry = 0; + + /* + * Precompute an incremental checksum adjustment so we can + * edit packets in this stream very quickly. The algorithm is from RFC1624. + */ + idx_32 = diff; + *(idx_32++) = cm->match_dest_ip->addr[0]; + *(idx_32++) = cm->match_dest_ip->addr[1]; + *(idx_32++) = cm->match_dest_ip->addr[2]; + *(idx_32++) = cm->match_dest_ip->addr[3]; + + idx_16 = (u16 *)idx_32; + *(idx_16++) = cm->match_dest_port; + *(idx_16++) = ~cm->xlate_dest_port; + idx_32 = (u32 *)idx_16; + + *(idx_32++) = ~cm->xlate_dest_ip->addr[0]; + *(idx_32++) = ~cm->xlate_dest_ip->addr[1]; + *(idx_32++) = ~cm->xlate_dest_ip->addr[2]; + *(idx_32++) = ~cm->xlate_dest_ip->addr[3]; + + /* + * When we compute this fold it down to a 16-bit offset + * as that way we can avoid having to do a double + * folding of the twos-complement result because the + * addition of 2 16-bit values cannot cause a double + * wrap-around! + */ + for (idx_32 = diff; idx_32 < diff + 9; idx_32++) { + u32 w = *idx_32; + adj += carry; + adj += w; + carry = (w > adj); + } + adj += carry; + adj = (adj & 0xffff) + (adj >> 16); + adj = (adj & 0xffff) + (adj >> 16); + cm->xlate_dest_csum_adjustment = (u16)adj; + } +} + +/* + * sfe_ipv6_update_summary_stats() + * Update the summary stats. 
+ */ +static void sfe_ipv6_update_summary_stats(struct sfe_ipv6 *si) +{ + int i; + + si->connection_create_requests64 += si->connection_create_requests; + si->connection_create_requests = 0; + si->connection_create_collisions64 += si->connection_create_collisions; + si->connection_create_collisions = 0; + si->connection_destroy_requests64 += si->connection_destroy_requests; + si->connection_destroy_requests = 0; + si->connection_destroy_misses64 += si->connection_destroy_misses; + si->connection_destroy_misses = 0; + si->connection_match_hash_hits64 += si->connection_match_hash_hits; + si->connection_match_hash_hits = 0; + si->connection_match_hash_reorders64 += si->connection_match_hash_reorders; + si->connection_match_hash_reorders = 0; + si->connection_flushes64 += si->connection_flushes; + si->connection_flushes = 0; + si->packets_forwarded64 += si->packets_forwarded; + si->packets_forwarded = 0; + si->packets_not_forwarded64 += si->packets_not_forwarded; + si->packets_not_forwarded = 0; + + for (i = 0; i < SFE_IPV6_EXCEPTION_EVENT_LAST; i++) { + si->exception_events64[i] += si->exception_events[i]; + si->exception_events[i] = 0; + } +} + +/* + * sfe_ipv6_insert_connection_match() + * Insert a connection match into the hash. + * + * On entry we must be holding the lock that protects the hash table. + */ +static inline void sfe_ipv6_insert_connection_match(struct sfe_ipv6 *si, + struct sfe_ipv6_connection_match *cm) +{ + struct sfe_ipv6_connection_match **hash_head; + struct sfe_ipv6_connection_match *prev_head; + unsigned int conn_match_idx + = sfe_ipv6_get_connection_match_hash(cm->match_dev, cm->match_protocol, + cm->match_src_ip, cm->match_src_port, + cm->match_dest_ip, cm->match_dest_port); + + hash_head = &si->conn_match_hash[conn_match_idx]; + prev_head = *hash_head; + cm->prev = NULL; + if (prev_head) { + prev_head->prev = cm; + } + + cm->next = prev_head; + *hash_head = cm; + +#ifdef CONFIG_NF_FLOW_COOKIE + if (!si->flow_cookie_enable || !(cm->flags & (SFE_IPV6_CONNECTION_MATCH_FLAG_XLATE_SRC | SFE_IPV6_CONNECTION_MATCH_FLAG_XLATE_DEST))) + return; + + /* + * Configure hardware to put a flow cookie in packet of this flow, + * then we can accelerate the lookup process when we received this packet. + */ + for (conn_match_idx = 1; conn_match_idx < SFE_FLOW_COOKIE_SIZE; conn_match_idx++) { + struct sfe_ipv6_flow_cookie_entry *entry = &si->sfe_flow_cookie_table[conn_match_idx]; + + if ((NULL == entry->match) && time_is_before_jiffies(entry->last_clean_time + HZ)) { + sfe_ipv6_flow_cookie_set_func_t func; + + rcu_read_lock(); + func = rcu_dereference(si->flow_cookie_set_func); + if (func) { + if (!func(cm->match_protocol, cm->match_src_ip->addr, cm->match_src_port, + cm->match_dest_ip->addr, cm->match_dest_port, conn_match_idx)) { + entry->match = cm; + cm->flow_cookie = conn_match_idx; + } else { + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_FLOW_COOKIE_ADD_FAIL]++; + } + } + rcu_read_unlock(); + + break; + } + } +#endif +} + +/* + * sfe_ipv6_remove_connection_match() + * Remove a connection match object from the hash. + * + * On entry we must be holding the lock that protects the hash table. 
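+ *
+ * The lock in question is si->lock; a sketch of the pattern the teardown
+ * paths in this file use:
+ *
+ *	spin_lock_bh(&si->lock);
+ *	sfe_ipv6_remove_connection(si, c);
+ *	spin_unlock_bh(&si->lock);
+ *
+ * with sfe_ipv6_flush_connection() called only after the lock is dropped.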
+ */
+static inline void sfe_ipv6_remove_connection_match(struct sfe_ipv6 *si, struct sfe_ipv6_connection_match *cm)
+{
+#ifdef CONFIG_NF_FLOW_COOKIE
+	if (si->flow_cookie_enable) {
+		/*
+		 * Tell hardware that we no longer need a flow cookie in packet of this flow
+		 */
+		unsigned int conn_match_idx;
+
+		for (conn_match_idx = 1; conn_match_idx < SFE_FLOW_COOKIE_SIZE; conn_match_idx++) {
+			struct sfe_ipv6_flow_cookie_entry *entry = &si->sfe_flow_cookie_table[conn_match_idx];
+
+			if (cm == entry->match) {
+				sfe_ipv6_flow_cookie_set_func_t func;
+
+				rcu_read_lock();
+				func = rcu_dereference(si->flow_cookie_set_func);
+				if (func) {
+					func(cm->match_protocol, cm->match_src_ip->addr, cm->match_src_port,
+					     cm->match_dest_ip->addr, cm->match_dest_port, 0);
+				}
+				rcu_read_unlock();
+
+				cm->flow_cookie = 0;
+				entry->match = NULL;
+				entry->last_clean_time = jiffies;
+				break;
+			}
+		}
+	}
+#endif
+
+	/*
+	 * Unlink the connection match entry from the hash.
+	 */
+	if (cm->prev) {
+		cm->prev->next = cm->next;
+	} else {
+		unsigned int conn_match_idx
+			= sfe_ipv6_get_connection_match_hash(cm->match_dev, cm->match_protocol,
+							     cm->match_src_ip, cm->match_src_port,
+							     cm->match_dest_ip, cm->match_dest_port);
+		si->conn_match_hash[conn_match_idx] = cm->next;
+	}
+
+	if (cm->next) {
+		cm->next->prev = cm->prev;
+	}
+
+	/*
+	 * If the connection match entry is in the active list remove it.
+	 */
+	if (cm->active) {
+		if (likely(cm->active_prev)) {
+			cm->active_prev->active_next = cm->active_next;
+		} else {
+			si->active_head = cm->active_next;
+		}
+
+		if (likely(cm->active_next)) {
+			cm->active_next->active_prev = cm->active_prev;
+		} else {
+			si->active_tail = cm->active_prev;
+		}
+	}
+}
+
+/*
+ * sfe_ipv6_get_connection_hash()
+ *	Generate the hash used in connection lookups.
+ */
+static inline unsigned int sfe_ipv6_get_connection_hash(u8 protocol, struct sfe_ipv6_addr *src_ip, __be16 src_port,
+							struct sfe_ipv6_addr *dest_ip, __be16 dest_port)
+{
+	u32 idx, hash = 0;
+
+	for (idx = 0; idx < 4; idx++) {
+		hash ^= src_ip->addr[idx] ^ dest_ip->addr[idx];
+	}
+	hash = hash ^ protocol ^ ntohs(src_port ^ dest_port);
+	return ((hash >> SFE_IPV6_CONNECTION_HASH_SHIFT) ^ hash) & SFE_IPV6_CONNECTION_HASH_MASK;
+}
+
+/*
+ * sfe_ipv6_find_connection()
+ *	Get the IPv6 connection info that corresponds to a particular 5-tuple.
+ *
+ * On entry we must be holding the lock that protects the hash table.
+ */
+static inline struct sfe_ipv6_connection *sfe_ipv6_find_connection(struct sfe_ipv6 *si, u32 protocol,
+								   struct sfe_ipv6_addr *src_ip, __be16 src_port,
+								   struct sfe_ipv6_addr *dest_ip, __be16 dest_port)
+{
+	struct sfe_ipv6_connection *c;
+	unsigned int conn_idx = sfe_ipv6_get_connection_hash(protocol, src_ip, src_port, dest_ip, dest_port);
+	c = si->conn_hash[conn_idx];
+
+	/*
+	 * If we don't have anything in this chain then bail.
+	 */
+	if (unlikely(!c)) {
+		return NULL;
+	}
+
+	/*
+	 * Hopefully the first entry is the one we want.
+	 */
+	if ((c->src_port == src_port)
+	    && (c->dest_port == dest_port)
+	    && (sfe_ipv6_addr_equal(c->src_ip, src_ip))
+	    && (sfe_ipv6_addr_equal(c->dest_ip, dest_ip))
+	    && (c->protocol == protocol)) {
+		return c;
+	}
+
+	/*
+	 * Unfortunately we didn't find it at head, so we search it in chain.
+ */ + do { + c = c->next; + } while (c && (c->src_port != src_port + || c->dest_port != dest_port + || !sfe_ipv6_addr_equal(c->src_ip, src_ip) + || !sfe_ipv6_addr_equal(c->dest_ip, dest_ip) + || c->protocol != protocol)); + + /* + * Will need connection entry for next create/destroy metadata, + * So no need to re-order entry for these requests + */ + return c; +} + +/* + * sfe_ipv6_mark_rule() + * Updates the mark for a current offloaded connection + * + * Will take hash lock upon entry + */ +void sfe_ipv6_mark_rule(struct sfe_connection_mark *mark) +{ + struct sfe_ipv6 *si = &__si6; + struct sfe_ipv6_connection *c; + + spin_lock_bh(&si->lock); + c = sfe_ipv6_find_connection(si, mark->protocol, + mark->src_ip.ip6, mark->src_port, + mark->dest_ip.ip6, mark->dest_port); + if (c) { + WARN_ON((0 != c->mark) && (0 == mark->mark)); + c->mark = mark->mark; + } + spin_unlock_bh(&si->lock); + + if (c) { + DEBUG_TRACE("Matching connection found for mark, " + "setting from %08x to %08x\n", + c->mark, mark->mark); + } +} + +/* + * sfe_ipv6_insert_connection() + * Insert a connection into the hash. + * + * On entry we must be holding the lock that protects the hash table. + */ +static void sfe_ipv6_insert_connection(struct sfe_ipv6 *si, struct sfe_ipv6_connection *c) +{ + struct sfe_ipv6_connection **hash_head; + struct sfe_ipv6_connection *prev_head; + unsigned int conn_idx; + + /* + * Insert entry into the connection hash. + */ + conn_idx = sfe_ipv6_get_connection_hash(c->protocol, c->src_ip, c->src_port, + c->dest_ip, c->dest_port); + hash_head = &si->conn_hash[conn_idx]; + prev_head = *hash_head; + c->prev = NULL; + if (prev_head) { + prev_head->prev = c; + } + + c->next = prev_head; + *hash_head = c; + + /* + * Insert entry into the "all connections" list. + */ + if (si->all_connections_tail) { + c->all_connections_prev = si->all_connections_tail; + si->all_connections_tail->all_connections_next = c; + } else { + c->all_connections_prev = NULL; + si->all_connections_head = c; + } + + si->all_connections_tail = c; + c->all_connections_next = NULL; + si->num_connections++; + + /* + * Insert the connection match objects too. + */ + sfe_ipv6_insert_connection_match(si, c->original_match); + sfe_ipv6_insert_connection_match(si, c->reply_match); +} + +/* + * sfe_ipv6_remove_connection() + * Remove a sfe_ipv6_connection object from the hash. + * + * On entry we must be holding the lock that protects the hash table. + */ +static void sfe_ipv6_remove_connection(struct sfe_ipv6 *si, struct sfe_ipv6_connection *c) +{ + /* + * Remove the connection match objects. + */ + sfe_ipv6_remove_connection_match(si, c->reply_match); + sfe_ipv6_remove_connection_match(si, c->original_match); + + /* + * Unlink the connection. + */ + if (c->prev) { + c->prev->next = c->next; + } else { + unsigned int conn_idx = sfe_ipv6_get_connection_hash(c->protocol, c->src_ip, c->src_port, + c->dest_ip, c->dest_port); + si->conn_hash[conn_idx] = c->next; + } + + if (c->next) { + c->next->prev = c->prev; + } + + /* + * Unlink connection from all_connections list + */ + if (c->all_connections_prev) { + c->all_connections_prev->all_connections_next = c->all_connections_next; + } else { + si->all_connections_head = c->all_connections_next; + } + + if (c->all_connections_next) { + c->all_connections_next->all_connections_prev = c->all_connections_prev; + } else { + si->all_connections_tail = c->all_connections_prev; + } + + si->num_connections--; +} + +/* + * sfe_ipv6_gen_sync_connection() + * Sync a connection. 
+ * + * On entry to this function we expect that the lock for the connection is either + * already held or isn't required. + */ +static void sfe_ipv6_gen_sync_connection(struct sfe_ipv6 *si, struct sfe_ipv6_connection *c, + struct sfe_connection_sync *sis, sfe_sync_reason_t reason, + u64 now_jiffies) +{ + struct sfe_ipv6_connection_match *original_cm; + struct sfe_ipv6_connection_match *reply_cm; + + /* + * Fill in the update message. + */ + sis->is_v6 = 1; + sis->protocol = c->protocol; + sis->src_ip.ip6[0] = c->src_ip[0]; + sis->src_ip_xlate.ip6[0] = c->src_ip_xlate[0]; + sis->dest_ip.ip6[0] = c->dest_ip[0]; + sis->dest_ip_xlate.ip6[0] = c->dest_ip_xlate[0]; + sis->src_port = c->src_port; + sis->src_port_xlate = c->src_port_xlate; + sis->dest_port = c->dest_port; + sis->dest_port_xlate = c->dest_port_xlate; + + original_cm = c->original_match; + reply_cm = c->reply_match; + sis->src_td_max_window = original_cm->protocol_state.tcp.max_win; + sis->src_td_end = original_cm->protocol_state.tcp.end; + sis->src_td_max_end = original_cm->protocol_state.tcp.max_end; + sis->dest_td_max_window = reply_cm->protocol_state.tcp.max_win; + sis->dest_td_end = reply_cm->protocol_state.tcp.end; + sis->dest_td_max_end = reply_cm->protocol_state.tcp.max_end; + + sis->src_new_packet_count = original_cm->rx_packet_count; + sis->src_new_byte_count = original_cm->rx_byte_count; + sis->dest_new_packet_count = reply_cm->rx_packet_count; + sis->dest_new_byte_count = reply_cm->rx_byte_count; + + sfe_ipv6_connection_match_update_summary_stats(original_cm); + sfe_ipv6_connection_match_update_summary_stats(reply_cm); + + sis->src_dev = original_cm->match_dev; + sis->src_packet_count = original_cm->rx_packet_count64; + sis->src_byte_count = original_cm->rx_byte_count64; + + sis->dest_dev = reply_cm->match_dev; + sis->dest_packet_count = reply_cm->rx_packet_count64; + sis->dest_byte_count = reply_cm->rx_byte_count64; + + sis->reason = reason; + + /* + * Get the time increment since our last sync. + */ + sis->delta_jiffies = now_jiffies - c->last_sync_jiffies; + c->last_sync_jiffies = now_jiffies; +} + +/* + * sfe_ipv6_flush_connection() + * Flush a connection and free all associated resources. + * + * We need to be called with bottom halves disabled locally as we need to acquire + * the connection hash lock and release it again. In general we're actually called + * from within a BH and so we're fine, but we're also called when connections are + * torn down. + */ +static void sfe_ipv6_flush_connection(struct sfe_ipv6 *si, + struct sfe_ipv6_connection *c, + sfe_sync_reason_t reason) +{ + struct sfe_connection_sync sis; + u64 now_jiffies; + sfe_sync_rule_callback_t sync_rule_callback; + + rcu_read_lock(); + spin_lock_bh(&si->lock); + si->connection_flushes++; + sync_rule_callback = rcu_dereference(si->sync_rule_callback); + spin_unlock_bh(&si->lock); + + if (sync_rule_callback) { + /* + * Generate a sync message and then sync. + */ + now_jiffies = get_jiffies_64(); + sfe_ipv6_gen_sync_connection(si, c, &sis, reason, now_jiffies); + sync_rule_callback(&sis); + } + + rcu_read_unlock(); + + /* + * Release our hold of the source and dest devices and free the memory + * for our connection objects. + */ + dev_put(c->original_dev); + dev_put(c->reply_dev); + kfree(c->original_match); + kfree(c->reply_match); + kfree(c); +} + +/* + * sfe_ipv6_recv_udp() + * Handle UDP packet receives and forwarding. 
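+ *
+ * Returns 1 if the packet was forwarded by the engine or 0 if it should
+ * be handed to the normal Linux receive path instead.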
+ */
+static int sfe_ipv6_recv_udp(struct sfe_ipv6 *si, struct sk_buff *skb, struct net_device *dev,
+			     unsigned int len, struct sfe_ipv6_ip_hdr *iph, unsigned int ihl, bool flush_on_find)
+{
+	struct sfe_ipv6_udp_hdr *udph;
+	struct sfe_ipv6_addr *src_ip;
+	struct sfe_ipv6_addr *dest_ip;
+	__be16 src_port;
+	__be16 dest_port;
+	struct sfe_ipv6_connection_match *cm;
+	struct net_device *xmit_dev;
+
+	/*
+	 * Is our packet too short to contain a valid UDP header?
+	 */
+	if (!pskb_may_pull(skb, (sizeof(struct sfe_ipv6_udp_hdr) + ihl))) {
+		spin_lock_bh(&si->lock);
+		si->exception_events[SFE_IPV6_EXCEPTION_EVENT_UDP_HEADER_INCOMPLETE]++;
+		si->packets_not_forwarded++;
+		spin_unlock_bh(&si->lock);
+
+		DEBUG_TRACE("packet too short for UDP header\n");
+		return 0;
+	}
+
+	/*
+	 * Read the IP address and port information.  Read the IP header data first
+	 * because we've almost certainly got that in the cache.  We may not yet have
+	 * the UDP header cached though so allow more time for any prefetching.
+	 */
+	src_ip = &iph->saddr;
+	dest_ip = &iph->daddr;
+
+	udph = (struct sfe_ipv6_udp_hdr *)(skb->data + ihl);
+	src_port = udph->source;
+	dest_port = udph->dest;
+
+	spin_lock_bh(&si->lock);
+
+	/*
+	 * Look for a connection match.
+	 */
+#ifdef CONFIG_NF_FLOW_COOKIE
+	cm = si->sfe_flow_cookie_table[skb->flow_cookie & SFE_FLOW_COOKIE_MASK].match;
+	if (unlikely(!cm)) {
+		cm = sfe_ipv6_find_connection_match(si, dev, IPPROTO_UDP, src_ip, src_port, dest_ip, dest_port);
+	}
+#else
+	cm = sfe_ipv6_find_connection_match(si, dev, IPPROTO_UDP, src_ip, src_port, dest_ip, dest_port);
+#endif
+	if (unlikely(!cm)) {
+		si->exception_events[SFE_IPV6_EXCEPTION_EVENT_UDP_NO_CONNECTION]++;
+		si->packets_not_forwarded++;
+		spin_unlock_bh(&si->lock);
+
+		DEBUG_TRACE("no connection found\n");
+		return 0;
+	}
+
+	/*
+	 * If our packet has been marked as "flush on find" we can't actually
+	 * forward it in the fast path, but now that we've found an associated
+	 * connection we can flush that out before we process the packet.
+	 */
+	if (unlikely(flush_on_find)) {
+		struct sfe_ipv6_connection *c = cm->connection;
+		sfe_ipv6_remove_connection(si, c);
+		si->exception_events[SFE_IPV6_EXCEPTION_EVENT_UDP_IP_OPTIONS_OR_INITIAL_FRAGMENT]++;
+		si->packets_not_forwarded++;
+		spin_unlock_bh(&si->lock);
+
+		DEBUG_TRACE("flush on find\n");
+		sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH);
+		return 0;
+	}
+
+#ifdef CONFIG_XFRM
+	/*
+	 * We can't accelerate the flow in this direction, just let it go
+	 * through the slow path.
+	 */
+	if (unlikely(!cm->flow_accel)) {
+		si->packets_not_forwarded++;
+		spin_unlock_bh(&si->lock);
+		return 0;
+	}
+#endif
+
+	/*
+	 * Does our hop_limit allow forwarding?
+	 */
+	if (unlikely(iph->hop_limit < 2)) {
+		struct sfe_ipv6_connection *c = cm->connection;
+		sfe_ipv6_remove_connection(si, c);
+		si->exception_events[SFE_IPV6_EXCEPTION_EVENT_UDP_SMALL_TTL]++;
+		si->packets_not_forwarded++;
+		spin_unlock_bh(&si->lock);
+
+		DEBUG_TRACE("hop_limit too low\n");
+		sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH);
+		return 0;
+	}
+
+	/*
+	 * If our packet is larger than the MTU of the transmit interface then
+	 * we can't forward it easily.
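+	 * An IPv6 router is not allowed to fragment in-flight packets, so we
+	 * exception the flow back to the Linux slow path, which can generate
+	 * the appropriate ICMPv6 "packet too big" message.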
+ */ + if (unlikely(len > cm->xmit_dev_mtu)) { + struct sfe_ipv6_connection *c = cm->connection; + sfe_ipv6_remove_connection(si, c); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_UDP_NEEDS_FRAGMENTATION]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("larger than mtu\n"); + sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + /* + * From this point on we're good to modify the packet. + */ + + /* + * Check if skb was cloned. If it was, unshare it. Because + * the data area is going to be written in this path and we don't want to + * change the cloned skb's data section. + */ + if (unlikely(skb_cloned(skb))) { + DEBUG_TRACE("%px: skb is a cloned skb\n", skb); + skb = skb_unshare(skb, GFP_ATOMIC); + if (!skb) { + DEBUG_WARN("Failed to unshare the cloned skb\n"); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_CLONED_SKB_UNSHARE_ERROR]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + return 0; + } + + /* + * Update the iph and udph pointers with the unshared skb's data area. + */ + iph = (struct sfe_ipv6_ip_hdr *)skb->data; + udph = (struct sfe_ipv6_udp_hdr *)(skb->data + ihl); + } + + /* + * Update DSCP + */ + if (unlikely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_DSCP_REMARK)) { + sfe_ipv6_change_dsfield(iph, cm->dscp); + } + + /* + * Decrement our hop_limit. + */ + iph->hop_limit -= 1; + + /* + * Do we have to perform translations of the source address/port? + */ + if (unlikely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_XLATE_SRC)) { + u16 udp_csum; + + iph->saddr = cm->xlate_src_ip[0]; + udph->source = cm->xlate_src_port; + + /* + * Do we have a non-zero UDP checksum? If we do then we need + * to update it. + */ + udp_csum = udph->check; + if (likely(udp_csum)) { + u32 sum = udp_csum + cm->xlate_src_csum_adjustment; + sum = (sum & 0xffff) + (sum >> 16); + udph->check = (u16)sum; + } + } + + /* + * Do we have to perform translations of the destination address/port? + */ + if (unlikely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_XLATE_DEST)) { + u16 udp_csum; + + iph->daddr = cm->xlate_dest_ip[0]; + udph->dest = cm->xlate_dest_port; + + /* + * Do we have a non-zero UDP checksum? If we do then we need + * to update it. + */ + udp_csum = udph->check; + if (likely(udp_csum)) { + u32 sum = udp_csum + cm->xlate_dest_csum_adjustment; + sum = (sum & 0xffff) + (sum >> 16); + udph->check = (u16)sum; + } + } + + /* + * Update traffic stats. + */ + cm->rx_packet_count++; + cm->rx_byte_count += len; + + /* + * If we're not already on the active list then insert ourselves at the tail + * of the current list. + */ + if (unlikely(!cm->active)) { + cm->active = true; + cm->active_prev = si->active_tail; + if (likely(si->active_tail)) { + si->active_tail->active_next = cm; + } else { + si->active_head = cm; + } + si->active_tail = cm; + } + + xmit_dev = cm->xmit_dev; + skb->dev = xmit_dev; + + /* + * Check to see if we need to write a header. + */ + if (likely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_WRITE_L2_HDR)) { + if (unlikely(!(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_WRITE_FAST_ETH_HDR))) { + dev_hard_header(skb, xmit_dev, ETH_P_IPV6, + cm->xmit_dest_mac, cm->xmit_src_mac, len); + } else { + /* + * For the simple case we write this really fast. 
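+			 * The MAC addresses are stored as three 16-bit words
+			 * (ETH_ALEN / 2), so the whole 14-byte Ethernet header
+			 * is written with seven 16-bit stores and no call into
+			 * dev_hard_header().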
+			 */
+			struct sfe_ipv6_eth_hdr *eth = (struct sfe_ipv6_eth_hdr *)__skb_push(skb, ETH_HLEN);
+			eth->h_proto = htons(ETH_P_IPV6);
+			eth->h_dest[0] = cm->xmit_dest_mac[0];
+			eth->h_dest[1] = cm->xmit_dest_mac[1];
+			eth->h_dest[2] = cm->xmit_dest_mac[2];
+			eth->h_source[0] = cm->xmit_src_mac[0];
+			eth->h_source[1] = cm->xmit_src_mac[1];
+			eth->h_source[2] = cm->xmit_src_mac[2];
+		}
+	}
+
+	/*
+	 * Update priority of skb.
+	 */
+	if (unlikely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_PRIORITY_REMARK)) {
+		skb->priority = cm->priority;
+	}
+
+	/*
+	 * Mark outgoing packet.
+	 */
+	skb->mark = cm->connection->mark;
+	if (skb->mark) {
+		DEBUG_TRACE("SKB MARK is NON ZERO %x\n", skb->mark);
+	}
+
+	si->packets_forwarded++;
+	spin_unlock_bh(&si->lock);
+
+	/*
+	 * We're going to check for GSO flags when we transmit the packet so
+	 * start fetching the necessary cache line now.
+	 */
+	prefetch(skb_shinfo(skb));
+
+	/*
+	 * Mark that this packet has been fast forwarded.
+	 */
+	skb->fast_forwarded = 1;
+
+	/*
+	 * Send the packet on its way.
+	 */
+	dev_queue_xmit(skb);
+
+	return 1;
+}
+
+/*
+ * sfe_ipv6_process_tcp_option_sack()
+ *	Parse the TCP SACK option and update the ACK accordingly.
+ */
+static bool sfe_ipv6_process_tcp_option_sack(const struct sfe_ipv6_tcp_hdr *th, const u32 data_offs,
+					     u32 *ack)
+{
+	u32 length = sizeof(struct sfe_ipv6_tcp_hdr);
+	u8 *ptr = (u8 *)th + length;
+
+	/*
+	 * Ignore processing if TCP packet has only TIMESTAMP option.
+	 */
+	if (likely(data_offs == length + TCPOLEN_TIMESTAMP + 1 + 1)
+	    && likely(ptr[0] == TCPOPT_NOP)
+	    && likely(ptr[1] == TCPOPT_NOP)
+	    && likely(ptr[2] == TCPOPT_TIMESTAMP)
+	    && likely(ptr[3] == TCPOLEN_TIMESTAMP)) {
+		return true;
+	}
+
+	/*
+	 * TCP options.  Parse SACK option.
+	 */
+	while (length < data_offs) {
+		u8 size;
+		u8 kind;
+
+		ptr = (u8 *)th + length;
+		kind = *ptr;
+
+		/*
+		 * NOP, for padding.
+		 * Handled before the other options so that we can skip it
+		 * quickly without reading an option size.
+		 */
+		if (kind == TCPOPT_NOP) {
+			length++;
+			continue;
+		}
+
+		if (kind == TCPOPT_SACK) {
+			u32 sack = 0;
+			u8 re = 1 + 1;
+
+			size = *(ptr + 1);
+			if ((size < (1 + 1 + TCPOLEN_SACK_PERBLOCK))
+			    || ((size - (1 + 1)) % (TCPOLEN_SACK_PERBLOCK))
+			    || (size > (data_offs - length))) {
+				return false;
+			}
+
+			re += 4;
+			while (re < size) {
+				u32 sack_re;
+				u8 *sptr = ptr + re;
+				sack_re = (sptr[0] << 24) | (sptr[1] << 16) | (sptr[2] << 8) | sptr[3];
+				if (sack_re > sack) {
+					sack = sack_re;
+				}
+				re += TCPOLEN_SACK_PERBLOCK;
+			}
+			if (sack > *ack) {
+				*ack = sack;
+			}
+			length += size;
+			continue;
+		}
+		if (kind == TCPOPT_EOL) {
+			return true;
+		}
+		size = *(ptr + 1);
+		if (size < 2) {
+			return false;
+		}
+		length += size;
+	}
+
+	return true;
+}
+
+/*
+ * sfe_ipv6_recv_tcp()
+ *	Handle TCP packet receives and forwarding.
+ */
+static int sfe_ipv6_recv_tcp(struct sfe_ipv6 *si, struct sk_buff *skb, struct net_device *dev,
+			     unsigned int len, struct sfe_ipv6_ip_hdr *iph, unsigned int ihl, bool flush_on_find)
+{
+	struct sfe_ipv6_tcp_hdr *tcph;
+	struct sfe_ipv6_addr *src_ip;
+	struct sfe_ipv6_addr *dest_ip;
+	__be16 src_port;
+	__be16 dest_port;
+	struct sfe_ipv6_connection_match *cm;
+	struct sfe_ipv6_connection_match *counter_cm;
+	u32 flags;
+	struct net_device *xmit_dev;
+
+	/*
+	 * Is our packet too short to contain a valid TCP header?
+	 */
+	if (!pskb_may_pull(skb, (sizeof(struct sfe_ipv6_tcp_hdr) + ihl))) {
+		spin_lock_bh(&si->lock);
+		si->exception_events[SFE_IPV6_EXCEPTION_EVENT_TCP_HEADER_INCOMPLETE]++;
+		si->packets_not_forwarded++;
+		spin_unlock_bh(&si->lock);
+
+		DEBUG_TRACE("packet too short for TCP header\n");
+		return 0;
+	}
+
+	/*
+	 * Read the IP address and port information.  Read the IP header data first
+	 * because we've almost certainly got that in the cache.  We may not yet have
+	 * the TCP header cached though so allow more time for any prefetching.
+	 */
+	src_ip = &iph->saddr;
+	dest_ip = &iph->daddr;
+
+	tcph = (struct sfe_ipv6_tcp_hdr *)(skb->data + ihl);
+	src_port = tcph->source;
+	dest_port = tcph->dest;
+	flags = tcp_flag_word(tcph);
+
+	spin_lock_bh(&si->lock);
+
+	/*
+	 * Look for a connection match.
+	 */
+#ifdef CONFIG_NF_FLOW_COOKIE
+	cm = si->sfe_flow_cookie_table[skb->flow_cookie & SFE_FLOW_COOKIE_MASK].match;
+	if (unlikely(!cm)) {
+		cm = sfe_ipv6_find_connection_match(si, dev, IPPROTO_TCP, src_ip, src_port, dest_ip, dest_port);
+	}
+#else
+	cm = sfe_ipv6_find_connection_match(si, dev, IPPROTO_TCP, src_ip, src_port, dest_ip, dest_port);
+#endif
+	if (unlikely(!cm)) {
+		/*
+		 * We didn't get a connection but as TCP is connection-oriented that
+		 * may be because this is a non-fast connection (not running established).
+		 * For diagnostic purposes we differentiate this here.
+		 */
+		if (likely((flags & (TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_FIN | TCP_FLAG_ACK)) == TCP_FLAG_ACK)) {
+			si->exception_events[SFE_IPV6_EXCEPTION_EVENT_TCP_NO_CONNECTION_FAST_FLAGS]++;
+			si->packets_not_forwarded++;
+			spin_unlock_bh(&si->lock);
+
+			DEBUG_TRACE("no connection found - fast flags\n");
+			return 0;
+		}
+		si->exception_events[SFE_IPV6_EXCEPTION_EVENT_TCP_NO_CONNECTION_SLOW_FLAGS]++;
+		si->packets_not_forwarded++;
+		spin_unlock_bh(&si->lock);
+
+		DEBUG_TRACE("no connection found - slow flags: 0x%x\n",
+			    flags & (TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_FIN | TCP_FLAG_ACK));
+		return 0;
+	}
+
+	/*
+	 * If our packet has been marked as "flush on find" we can't actually
+	 * forward it in the fast path, but now that we've found an associated
+	 * connection we can flush that out before we process the packet.
+	 */
+	if (unlikely(flush_on_find)) {
+		struct sfe_ipv6_connection *c = cm->connection;
+		sfe_ipv6_remove_connection(si, c);
+		si->exception_events[SFE_IPV6_EXCEPTION_EVENT_TCP_IP_OPTIONS_OR_INITIAL_FRAGMENT]++;
+		si->packets_not_forwarded++;
+		spin_unlock_bh(&si->lock);
+
+		DEBUG_TRACE("flush on find\n");
+		sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH);
+		return 0;
+	}
+
+#ifdef CONFIG_XFRM
+	/*
+	 * We can't accelerate the flow in this direction, just let it go
+	 * through the slow path.
+	 */
+	if (unlikely(!cm->flow_accel)) {
+		si->packets_not_forwarded++;
+		spin_unlock_bh(&si->lock);
+		return 0;
+	}
+#endif
+
+	/*
+	 * Does our hop_limit allow forwarding?
+	 */
+	if (unlikely(iph->hop_limit < 2)) {
+		struct sfe_ipv6_connection *c = cm->connection;
+		sfe_ipv6_remove_connection(si, c);
+		si->exception_events[SFE_IPV6_EXCEPTION_EVENT_TCP_SMALL_TTL]++;
+		si->packets_not_forwarded++;
+		spin_unlock_bh(&si->lock);
+
+		DEBUG_TRACE("hop_limit too low\n");
+		sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH);
+		return 0;
+	}
+
+	/*
+	 * If our packet is larger than the MTU of the transmit interface then
+	 * we can't forward it easily.
+ */ + if (unlikely((len > cm->xmit_dev_mtu) && !skb_is_gso(skb))) { + struct sfe_ipv6_connection *c = cm->connection; + sfe_ipv6_remove_connection(si, c); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_TCP_NEEDS_FRAGMENTATION]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("larger than mtu\n"); + sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + /* + * Look at our TCP flags. Anything missing an ACK or that has RST, SYN or FIN + * set is not a fast path packet. + */ + if (unlikely((flags & (TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_FIN | TCP_FLAG_ACK)) != TCP_FLAG_ACK)) { + struct sfe_ipv6_connection *c = cm->connection; + sfe_ipv6_remove_connection(si, c); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_TCP_FLAGS]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("TCP flags: 0x%x are not fast\n", + flags & (TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_FIN | TCP_FLAG_ACK)); + sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + counter_cm = cm->counter_match; + + /* + * Are we doing sequence number checking? + */ + if (likely(!(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK))) { + u32 seq; + u32 ack; + u32 sack; + u32 data_offs; + u32 end; + u32 left_edge; + u32 scaled_win; + u32 max_end; + + /* + * Is our sequence fully past the right hand edge of the window? + */ + seq = ntohl(tcph->seq); + if (unlikely((s32)(seq - (cm->protocol_state.tcp.max_end + 1)) > 0)) { + struct sfe_ipv6_connection *c = cm->connection; + sfe_ipv6_remove_connection(si, c); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_TCP_SEQ_EXCEEDS_RIGHT_EDGE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("seq: %u exceeds right edge: %u\n", + seq, cm->protocol_state.tcp.max_end + 1); + sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + /* + * Check that our TCP data offset isn't too short. + */ + data_offs = tcph->doff << 2; + if (unlikely(data_offs < sizeof(struct sfe_ipv6_tcp_hdr))) { + struct sfe_ipv6_connection *c = cm->connection; + sfe_ipv6_remove_connection(si, c); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_TCP_SMALL_DATA_OFFS]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("TCP data offset: %u, too small\n", data_offs); + sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + /* + * Update ACK according to any SACK option. + */ + ack = ntohl(tcph->ack_seq); + sack = ack; + if (unlikely(!sfe_ipv6_process_tcp_option_sack(tcph, data_offs, &sack))) { + struct sfe_ipv6_connection *c = cm->connection; + sfe_ipv6_remove_connection(si, c); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_TCP_BAD_SACK]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("TCP option SACK size is wrong\n"); + sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + /* + * Check that our TCP data offset isn't past the end of the packet. 
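+		 * At this point data_offs only covers the TCP header and its
+		 * options (tcph->doff << 2).  Adding the fixed IPv6 header size
+		 * converts it to an offset from the start of the IPv6 header,
+		 * which is what "len" measures, so the two can be compared
+		 * directly.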
+ */ + data_offs += sizeof(struct sfe_ipv6_ip_hdr); + if (unlikely(len < data_offs)) { + struct sfe_ipv6_connection *c = cm->connection; + sfe_ipv6_remove_connection(si, c); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_TCP_BIG_DATA_OFFS]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("TCP data offset: %u, past end of packet: %u\n", + data_offs, len); + sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + end = seq + len - data_offs; + + /* + * Is our sequence fully before the left hand edge of the window? + */ + if (unlikely((s32)(end - (cm->protocol_state.tcp.end + - counter_cm->protocol_state.tcp.max_win - 1)) < 0)) { + struct sfe_ipv6_connection *c = cm->connection; + sfe_ipv6_remove_connection(si, c); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_TCP_SEQ_BEFORE_LEFT_EDGE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("seq: %u before left edge: %u\n", + end, cm->protocol_state.tcp.end - counter_cm->protocol_state.tcp.max_win - 1); + sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + /* + * Are we acking data that is to the right of what has been sent? + */ + if (unlikely((s32)(sack - (counter_cm->protocol_state.tcp.end + 1)) > 0)) { + struct sfe_ipv6_connection *c = cm->connection; + sfe_ipv6_remove_connection(si, c); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_TCP_ACK_EXCEEDS_RIGHT_EDGE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("ack: %u exceeds right edge: %u\n", + sack, counter_cm->protocol_state.tcp.end + 1); + sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + /* + * Is our ack too far before the left hand edge of the window? + */ + left_edge = counter_cm->protocol_state.tcp.end + - cm->protocol_state.tcp.max_win + - SFE_IPV6_TCP_MAX_ACK_WINDOW + - 1; + if (unlikely((s32)(sack - left_edge) < 0)) { + struct sfe_ipv6_connection *c = cm->connection; + sfe_ipv6_remove_connection(si, c); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_TCP_ACK_BEFORE_LEFT_EDGE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("ack: %u before left edge: %u\n", sack, left_edge); + sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + /* + * Have we just seen the largest window size yet for this connection? If yes + * then we need to record the new value. + */ + scaled_win = ntohs(tcph->window) << cm->protocol_state.tcp.win_scale; + scaled_win += (sack - ack); + if (unlikely(cm->protocol_state.tcp.max_win < scaled_win)) { + cm->protocol_state.tcp.max_win = scaled_win; + } + + /* + * If our sequence and/or ack numbers have advanced then record the new state. + */ + if (likely((s32)(end - cm->protocol_state.tcp.end) >= 0)) { + cm->protocol_state.tcp.end = end; + } + + max_end = sack + scaled_win; + if (likely((s32)(max_end - counter_cm->protocol_state.tcp.max_end) >= 0)) { + counter_cm->protocol_state.tcp.max_end = max_end; + } + } + + /* + * From this point on we're good to modify the packet. + */ + + /* + * Check if skb was cloned. If it was, unshare it. Because + * the data area is going to be written in this path and we don't want to + * change the cloned skb's data section. 
+	 */
+	if (unlikely(skb_cloned(skb))) {
+		DEBUG_TRACE("%px: skb is a cloned skb\n", skb);
+		skb = skb_unshare(skb, GFP_ATOMIC);
+		if (!skb) {
+			DEBUG_WARN("Failed to unshare the cloned skb\n");
+			si->exception_events[SFE_IPV6_EXCEPTION_EVENT_CLONED_SKB_UNSHARE_ERROR]++;
+			si->packets_not_forwarded++;
+			spin_unlock_bh(&si->lock);
+			return 0;
+		}
+
+		/*
+		 * Update the iph and tcph pointers with the unshared skb's data area.
+		 */
+		iph = (struct sfe_ipv6_ip_hdr *)skb->data;
+		tcph = (struct sfe_ipv6_tcp_hdr *)(skb->data + ihl);
+	}
+
+	/*
+	 * Update DSCP
+	 */
+	if (unlikely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_DSCP_REMARK)) {
+		sfe_ipv6_change_dsfield(iph, cm->dscp);
+	}
+
+	/*
+	 * Decrement our hop_limit.
+	 */
+	iph->hop_limit -= 1;
+
+	/*
+	 * Do we have to perform translations of the source address/port?
+	 */
+	if (unlikely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_XLATE_SRC)) {
+		u16 tcp_csum;
+		u32 sum;
+
+		iph->saddr = cm->xlate_src_ip[0];
+		tcph->source = cm->xlate_src_port;
+
+		/*
+		 * The TCP checksum is mandatory so, unlike the UDP case,
+		 * we always have to update it here.
+		 */
+		tcp_csum = tcph->check;
+		sum = tcp_csum + cm->xlate_src_csum_adjustment;
+		sum = (sum & 0xffff) + (sum >> 16);
+		tcph->check = (u16)sum;
+	}
+
+	/*
+	 * Do we have to perform translations of the destination address/port?
+	 */
+	if (unlikely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_XLATE_DEST)) {
+		u16 tcp_csum;
+		u32 sum;
+
+		iph->daddr = cm->xlate_dest_ip[0];
+		tcph->dest = cm->xlate_dest_port;
+
+		/*
+		 * The TCP checksum is mandatory so, unlike the UDP case,
+		 * we always have to update it here.
+		 */
+		tcp_csum = tcph->check;
+		sum = tcp_csum + cm->xlate_dest_csum_adjustment;
+		sum = (sum & 0xffff) + (sum >> 16);
+		tcph->check = (u16)sum;
+	}
+
+	/*
+	 * Update traffic stats.
+	 */
+	cm->rx_packet_count++;
+	cm->rx_byte_count += len;
+
+	/*
+	 * If we're not already on the active list then insert ourselves at the tail
+	 * of the current list.
+	 */
+	if (unlikely(!cm->active)) {
+		cm->active = true;
+		cm->active_prev = si->active_tail;
+		if (likely(si->active_tail)) {
+			si->active_tail->active_next = cm;
+		} else {
+			si->active_head = cm;
+		}
+		si->active_tail = cm;
+	}
+
+	xmit_dev = cm->xmit_dev;
+	skb->dev = xmit_dev;
+
+	/*
+	 * Check to see if we need to write a header.
+	 */
+	if (likely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_WRITE_L2_HDR)) {
+		if (unlikely(!(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_WRITE_FAST_ETH_HDR))) {
+			dev_hard_header(skb, xmit_dev, ETH_P_IPV6,
+					cm->xmit_dest_mac, cm->xmit_src_mac, len);
+		} else {
+			/*
+			 * For the simple case we write this really fast.
+			 */
+			struct sfe_ipv6_eth_hdr *eth = (struct sfe_ipv6_eth_hdr *)__skb_push(skb, ETH_HLEN);
+			eth->h_proto = htons(ETH_P_IPV6);
+			eth->h_dest[0] = cm->xmit_dest_mac[0];
+			eth->h_dest[1] = cm->xmit_dest_mac[1];
+			eth->h_dest[2] = cm->xmit_dest_mac[2];
+			eth->h_source[0] = cm->xmit_src_mac[0];
+			eth->h_source[1] = cm->xmit_src_mac[1];
+			eth->h_source[2] = cm->xmit_src_mac[2];
+		}
+	}
+
+	/*
+	 * Update priority of skb.
+	 */
+	if (unlikely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_PRIORITY_REMARK)) {
+		skb->priority = cm->priority;
+	}
+
+	/*
+	 * Mark outgoing packet.
+	 */
+	skb->mark = cm->connection->mark;
+	if (skb->mark) {
+		DEBUG_TRACE("SKB MARK is NON ZERO %x\n", skb->mark);
+	}
+
+	si->packets_forwarded++;
+	spin_unlock_bh(&si->lock);
+
+	/*
+	 * We're going to check for GSO flags when we transmit the packet so
+	 * start fetching the necessary cache line now.
+	 */
+	prefetch(skb_shinfo(skb));
+
+	/*
+	 * Mark that this packet has been fast forwarded.
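+	 * (skb->fast_forwarded is not a field in the vanilla kernel's sk_buff;
+	 * it is added by the SFE support patches so that the receive hook does
+	 * not try to process a packet that SFE has already handled.)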
+ */ + skb->fast_forwarded = 1; + + /* + * Send the packet on its way. + */ + dev_queue_xmit(skb); + + return 1; +} + +/* + * sfe_ipv6_recv_icmp() + * Handle ICMP packet receives. + * + * ICMP packets aren't handled as a "fast path" and always have us process them + * through the default Linux stack. What we do need to do is look for any errors + * about connections we are handling in the fast path. If we find any such + * connections then we want to flush their state so that the ICMP error path + * within Linux has all of the correct state should it need it. + */ +static int sfe_ipv6_recv_icmp(struct sfe_ipv6 *si, struct sk_buff *skb, struct net_device *dev, + unsigned int len, struct sfe_ipv6_ip_hdr *iph, unsigned int ihl) +{ + struct icmp6hdr *icmph; + struct sfe_ipv6_ip_hdr *icmp_iph; + struct sfe_ipv6_udp_hdr *icmp_udph; + struct sfe_ipv6_tcp_hdr *icmp_tcph; + struct sfe_ipv6_addr *src_ip; + struct sfe_ipv6_addr *dest_ip; + __be16 src_port; + __be16 dest_port; + struct sfe_ipv6_connection_match *cm; + struct sfe_ipv6_connection *c; + u8 next_hdr; + + /* + * Is our packet too short to contain a valid ICMP header? + */ + len -= ihl; + if (!pskb_may_pull(skb, ihl + sizeof(struct icmp6hdr))) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_ICMP_HEADER_INCOMPLETE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("packet too short for ICMP header\n"); + return 0; + } + + /* + * We only handle "destination unreachable" and "time exceeded" messages. + */ + icmph = (struct icmp6hdr *)(skb->data + ihl); + if ((icmph->icmp6_type != ICMPV6_DEST_UNREACH) + && (icmph->icmp6_type != ICMPV6_TIME_EXCEED)) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_ICMP_UNHANDLED_TYPE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("unhandled ICMP type: 0x%x\n", icmph->icmp6_type); + return 0; + } + + /* + * Do we have the full embedded IP header? + * We should have 8 bytes of next L4 header - that's enough to identify + * the connection. + */ + len -= sizeof(struct icmp6hdr); + ihl += sizeof(struct icmp6hdr); + if (!pskb_may_pull(skb, ihl + sizeof(struct sfe_ipv6_ip_hdr) + sizeof(struct sfe_ipv6_ext_hdr))) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_ICMP_IPV6_HEADER_INCOMPLETE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("Embedded IP header not complete\n"); + return 0; + } + + /* + * Is our embedded IP version wrong? 
+ */
+	icmp_iph = (struct sfe_ipv6_ip_hdr *)(icmph + 1);
+	if (unlikely(icmp_iph->version != 6)) {
+		spin_lock_bh(&si->lock);
+		si->exception_events[SFE_IPV6_EXCEPTION_EVENT_ICMP_IPV6_NON_V6]++;
+		si->packets_not_forwarded++;
+		spin_unlock_bh(&si->lock);
+
+		DEBUG_TRACE("IP version: %u\n", icmp_iph->version);
+		return 0;
+	}
+
+	len -= sizeof(struct sfe_ipv6_ip_hdr);
+	ihl += sizeof(struct sfe_ipv6_ip_hdr);
+	next_hdr = icmp_iph->nexthdr;
+	while (unlikely(sfe_ipv6_is_ext_hdr(next_hdr))) {
+		struct sfe_ipv6_ext_hdr *ext_hdr;
+		unsigned int ext_hdr_len;
+
+		ext_hdr = (struct sfe_ipv6_ext_hdr *)(skb->data + ihl);
+		if (next_hdr == SFE_IPV6_EXT_HDR_FRAG) {
+			struct sfe_ipv6_frag_hdr *frag_hdr = (struct sfe_ipv6_frag_hdr *)ext_hdr;
+			unsigned int frag_off = ntohs(frag_hdr->frag_off);
+
+			if (frag_off & SFE_IPV6_FRAG_OFFSET) {
+				spin_lock_bh(&si->lock);
+				si->exception_events[SFE_IPV6_EXCEPTION_EVENT_NON_INITIAL_FRAGMENT]++;
+				si->packets_not_forwarded++;
+				spin_unlock_bh(&si->lock);
+
+				DEBUG_TRACE("non-initial fragment\n");
+				return 0;
+			}
+		}
+
+		ext_hdr_len = ext_hdr->hdr_len;
+		ext_hdr_len <<= 3;
+		ext_hdr_len += sizeof(struct sfe_ipv6_ext_hdr);
+		len -= ext_hdr_len;
+		ihl += ext_hdr_len;
+		/*
+		 * We should have 8 bytes of next header - that's enough to identify
+		 * the connection.
+		 */
+		if (!pskb_may_pull(skb, ihl + sizeof(struct sfe_ipv6_ext_hdr))) {
+			spin_lock_bh(&si->lock);
+			si->exception_events[SFE_IPV6_EXCEPTION_EVENT_HEADER_INCOMPLETE]++;
+			si->packets_not_forwarded++;
+			spin_unlock_bh(&si->lock);
+
+			DEBUG_TRACE("extension header %u incomplete\n", next_hdr);
+			return 0;
+		}
+
+		next_hdr = ext_hdr->next_hdr;
+	}
+
+	/*
+	 * Handle the embedded transport layer header.
+	 */
+	switch (next_hdr) {
+	case IPPROTO_UDP:
+		icmp_udph = (struct sfe_ipv6_udp_hdr *)(skb->data + ihl);
+		src_port = icmp_udph->source;
+		dest_port = icmp_udph->dest;
+		break;
+
+	case IPPROTO_TCP:
+		icmp_tcph = (struct sfe_ipv6_tcp_hdr *)(skb->data + ihl);
+		src_port = icmp_tcph->source;
+		dest_port = icmp_tcph->dest;
+		break;
+
+	default:
+		spin_lock_bh(&si->lock);
+		si->exception_events[SFE_IPV6_EXCEPTION_EVENT_ICMP_IPV6_UNHANDLED_PROTOCOL]++;
+		si->packets_not_forwarded++;
+		spin_unlock_bh(&si->lock);
+
+		DEBUG_TRACE("Unhandled embedded IP protocol: %u\n", next_hdr);
+		return 0;
+	}
+
+	src_ip = &icmp_iph->saddr;
+	dest_ip = &icmp_iph->daddr;
+
+	spin_lock_bh(&si->lock);
+
+	/*
+	 * Look for a connection match. Note that we reverse the source and destination
+	 * here because our embedded message contains a packet that was sent in the
+	 * opposite direction to the one in which we just received it. It will have
+	 * been sent on the interface from which we received it though so that's still
+	 * ok to use. We also match on the upper-layer protocol we just parsed
+	 * (next_hdr) rather than icmp_iph->nexthdr, which may name an extension header.
+	 */
+	cm = sfe_ipv6_find_connection_match(si, dev, next_hdr, dest_ip, dest_port, src_ip, src_port);
+	if (unlikely(!cm)) {
+		si->exception_events[SFE_IPV6_EXCEPTION_EVENT_ICMP_NO_CONNECTION]++;
+		si->packets_not_forwarded++;
+		spin_unlock_bh(&si->lock);
+
+		DEBUG_TRACE("no connection found\n");
+		return 0;
+	}
+
+	/*
+	 * We found a connection so now remove it from the connection list and flush
+	 * its state.
+	 */
+	c = cm->connection;
+	sfe_ipv6_remove_connection(si, c);
+	si->exception_events[SFE_IPV6_EXCEPTION_EVENT_ICMP_FLUSHED_CONNECTION]++;
+	si->packets_not_forwarded++;
+	spin_unlock_bh(&si->lock);
+
+	sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH);
+	return 0;
+}
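
The walk above only ever pulls 8 bytes of the embedded transport header because that is all the lookup needs: TCP and UDP headers both begin with the source/destination port pair. A sketch of that shared prefix (illustrative only, not a struct from this driver):

    #include <stdint.h>
    #include <string.h>

    /*
     * The first 8 bytes of TCP and UDP headers share this shape; the
     * port pair in bytes 0-3, together with the embedded addresses and
     * protocol, identifies the connection.
     */
    struct l4_port_prefix {
    	uint16_t source;	/* bytes 0-1 in both TCP and UDP */
    	uint16_t dest;		/* bytes 2-3 in both TCP and UDP */
    	uint8_t rest[4];	/* UDP: length/checksum; TCP: start of sequence number */
    };

    static void l4_ports(const uint8_t *l4, uint16_t *sport, uint16_t *dport)
    {
    	struct l4_port_prefix p;

    	memcpy(&p, l4, sizeof(p));	/* avoid unaligned access */
    	*sport = p.source;
    	*dport = p.dest;
    }

+/*
+ * sfe_ipv6_recv()
+ *	Handle packet receives and forwarding.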
+ *
+ * Returns 1 if the packet is forwarded or 0 if it isn't.
+ */
+int sfe_ipv6_recv(struct net_device *dev, struct sk_buff *skb)
+{
+	struct sfe_ipv6 *si = &__si6;
+	unsigned int len;
+	unsigned int payload_len;
+	unsigned int ihl = sizeof(struct sfe_ipv6_ip_hdr);
+	bool flush_on_find = false;
+	struct sfe_ipv6_ip_hdr *iph;
+	u8 next_hdr;
+
+	/*
+	 * Check that we have space for an IPv6 header and an upper-layer header here.
+	 */
+	len = skb->len;
+	if (!pskb_may_pull(skb, ihl + sizeof(struct sfe_ipv6_ext_hdr))) {
+		spin_lock_bh(&si->lock);
+		si->exception_events[SFE_IPV6_EXCEPTION_EVENT_HEADER_INCOMPLETE]++;
+		si->packets_not_forwarded++;
+		spin_unlock_bh(&si->lock);
+
+		DEBUG_TRACE("len: %u is too short\n", len);
+		return 0;
+	}
+
+	/*
+	 * Is our IP version wrong?
+	 */
+	iph = (struct sfe_ipv6_ip_hdr *)skb->data;
+	if (unlikely(iph->version != 6)) {
+		spin_lock_bh(&si->lock);
+		si->exception_events[SFE_IPV6_EXCEPTION_EVENT_NON_V6]++;
+		si->packets_not_forwarded++;
+		spin_unlock_bh(&si->lock);
+
+		DEBUG_TRACE("IP version: %u\n", iph->version);
+		return 0;
+	}
+
+	/*
+	 * Does our datagram fit inside the skb?
+	 */
+	payload_len = ntohs(iph->payload_len);
+	if (unlikely(payload_len > (len - ihl))) {
+		spin_lock_bh(&si->lock);
+		si->exception_events[SFE_IPV6_EXCEPTION_EVENT_DATAGRAM_INCOMPLETE]++;
+		si->packets_not_forwarded++;
+		spin_unlock_bh(&si->lock);
+
+		DEBUG_TRACE("payload_len: %u exceeds available length: %u\n", payload_len, (len - ihl));
+		return 0;
+	}
+
+	next_hdr = iph->nexthdr;
+	while (unlikely(sfe_ipv6_is_ext_hdr(next_hdr))) {
+		struct sfe_ipv6_ext_hdr *ext_hdr;
+		unsigned int ext_hdr_len;
+
+		ext_hdr = (struct sfe_ipv6_ext_hdr *)(skb->data + ihl);
+		if (next_hdr == SFE_IPV6_EXT_HDR_FRAG) {
+			struct sfe_ipv6_frag_hdr *frag_hdr = (struct sfe_ipv6_frag_hdr *)ext_hdr;
+			unsigned int frag_off = ntohs(frag_hdr->frag_off);
+
+			if (frag_off & SFE_IPV6_FRAG_OFFSET) {
+				spin_lock_bh(&si->lock);
+				si->exception_events[SFE_IPV6_EXCEPTION_EVENT_NON_INITIAL_FRAGMENT]++;
+				si->packets_not_forwarded++;
+				spin_unlock_bh(&si->lock);
+
+				DEBUG_TRACE("non-initial fragment\n");
+				return 0;
+			}
+		}
+
+		ext_hdr_len = ext_hdr->hdr_len;
+		ext_hdr_len <<= 3;
+		ext_hdr_len += sizeof(struct sfe_ipv6_ext_hdr);
+		ihl += ext_hdr_len;
+		if (!pskb_may_pull(skb, ihl + sizeof(struct sfe_ipv6_ext_hdr))) {
+			spin_lock_bh(&si->lock);
+			si->exception_events[SFE_IPV6_EXCEPTION_EVENT_HEADER_INCOMPLETE]++;
+			si->packets_not_forwarded++;
+			spin_unlock_bh(&si->lock);
+
+			DEBUG_TRACE("extension header %u incomplete\n", next_hdr);
+			return 0;
+		}
+
+		flush_on_find = true;
+		next_hdr = ext_hdr->next_hdr;
+	}
+
+	if (IPPROTO_UDP == next_hdr) {
+		return sfe_ipv6_recv_udp(si, skb, dev, len, iph, ihl, flush_on_find);
+	}
+
+	if (IPPROTO_TCP == next_hdr) {
+		return sfe_ipv6_recv_tcp(si, skb, dev, len, iph, ihl, flush_on_find);
+	}
+
+	if (IPPROTO_ICMPV6 == next_hdr) {
+		return sfe_ipv6_recv_icmp(si, skb, dev, len, iph, ihl);
+	}
+
+	spin_lock_bh(&si->lock);
+	si->exception_events[SFE_IPV6_EXCEPTION_EVENT_UNHANDLED_PROTOCOL]++;
+	si->packets_not_forwarded++;
+	spin_unlock_bh(&si->lock);
+
+	DEBUG_TRACE("not UDP, TCP or ICMP: %u\n", next_hdr);
+	return 0;
+}
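
Both extension-header walks (here and in the ICMP path above) compute header lengths the same way: hdr_len counts 8-octet units beyond the mandatory first 8 octets of the header. A small illustrative sketch, assuming an 8-byte generic prefix like struct sfe_ipv6_ext_hdr:

    #include <stdint.h>

    /*
     * Illustrative only: mirrors the generic extension header prefix the
     * walks above cast to. hdr_len is in 8-octet units and excludes the
     * first 8 octets of the header itself.
     */
    struct ext_hdr_prefix {
    	uint8_t next_hdr;
    	uint8_t hdr_len;
    	uint8_t padding[6];
    };

    static unsigned int ext_hdr_bytes(const struct ext_hdr_prefix *ext)
    {
    	/* (hdr_len << 3) + 8, as computed in sfe_ipv6_recv() above. */
    	return ((unsigned int)ext->hdr_len << 3) + sizeof(struct ext_hdr_prefix);
    }

+/*
+ * sfe_ipv6_update_tcp_state()
+ *	update TCP window variables.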
+ */
+static void
+sfe_ipv6_update_tcp_state(struct sfe_ipv6_connection *c,
+			  struct sfe_connection_create *sic)
+{
+	struct sfe_ipv6_connection_match *orig_cm;
+	struct sfe_ipv6_connection_match *repl_cm;
+	struct sfe_ipv6_tcp_connection_match *orig_tcp;
+	struct sfe_ipv6_tcp_connection_match *repl_tcp;
+
+	orig_cm = c->original_match;
+	repl_cm = c->reply_match;
+	orig_tcp = &orig_cm->protocol_state.tcp;
+	repl_tcp = &repl_cm->protocol_state.tcp;
+
+	/* update orig */
+	if (orig_tcp->max_win < sic->src_td_max_window) {
+		orig_tcp->max_win = sic->src_td_max_window;
+	}
+	if ((s32)(orig_tcp->end - sic->src_td_end) < 0) {
+		orig_tcp->end = sic->src_td_end;
+	}
+	if ((s32)(orig_tcp->max_end - sic->src_td_max_end) < 0) {
+		orig_tcp->max_end = sic->src_td_max_end;
+	}
+
+	/* update reply */
+	if (repl_tcp->max_win < sic->dest_td_max_window) {
+		repl_tcp->max_win = sic->dest_td_max_window;
+	}
+	if ((s32)(repl_tcp->end - sic->dest_td_end) < 0) {
+		repl_tcp->end = sic->dest_td_end;
+	}
+	if ((s32)(repl_tcp->max_end - sic->dest_td_max_end) < 0) {
+		repl_tcp->max_end = sic->dest_td_max_end;
+	}
+
+	/* update match flags */
+	orig_cm->flags &= ~SFE_IPV6_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK;
+	repl_cm->flags &= ~SFE_IPV6_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK;
+	if (sic->flags & SFE_CREATE_FLAG_NO_SEQ_CHECK) {
+		orig_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK;
+		repl_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK;
+	}
+}
+
+/*
+ * sfe_ipv6_update_protocol_state()
+ *	update the protocol-specific state machine.
+ */
+static void
+sfe_ipv6_update_protocol_state(struct sfe_ipv6_connection *c,
+			       struct sfe_connection_create *sic)
+{
+	switch (sic->protocol) {
+	case IPPROTO_TCP:
+		sfe_ipv6_update_tcp_state(c, sic);
+		break;
+	}
+}
+
+/*
+ * sfe_ipv6_update_rule()
+ *	update forwarding rule after rule is created.
+ */
+void sfe_ipv6_update_rule(struct sfe_connection_create *sic)
+{
+	struct sfe_ipv6_connection *c;
+	struct sfe_ipv6 *si = &__si6;
+
+	spin_lock_bh(&si->lock);
+
+	c = sfe_ipv6_find_connection(si,
+				     sic->protocol,
+				     sic->src_ip.ip6,
+				     sic->src_port,
+				     sic->dest_ip.ip6,
+				     sic->dest_port);
+	if (c != NULL) {
+		sfe_ipv6_update_protocol_state(c, sic);
+	}
+
+	spin_unlock_bh(&si->lock);
+}
+
+/*
+ * sfe_ipv6_create_rule()
+ *	Create a forwarding rule.
+ */
+int sfe_ipv6_create_rule(struct sfe_connection_create *sic)
+{
+	struct sfe_ipv6 *si = &__si6;
+	struct sfe_ipv6_connection *c;
+	struct sfe_ipv6_connection_match *original_cm;
+	struct sfe_ipv6_connection_match *reply_cm;
+	struct net_device *dest_dev;
+	struct net_device *src_dev;
+
+	dest_dev = sic->dest_dev;
+	src_dev = sic->src_dev;
+
+	if (unlikely((dest_dev->reg_state != NETREG_REGISTERED) ||
+		     (src_dev->reg_state != NETREG_REGISTERED))) {
+		return -EINVAL;
+	}
+
+	spin_lock_bh(&si->lock);
+	si->connection_create_requests++;
+
+	/*
+	 * Check to see if there is already a flow that matches the rule we're
+	 * trying to create. If there is then we can't create a new one.
+	 */
+	c = sfe_ipv6_find_connection(si,
+				     sic->protocol,
+				     sic->src_ip.ip6,
+				     sic->src_port,
+				     sic->dest_ip.ip6,
+				     sic->dest_port);
+	if (c != NULL) {
+		si->connection_create_collisions++;
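
The (s32)(a - b) < 0 comparisons in sfe_ipv6_update_tcp_state() above are serial-number arithmetic: sequence-space values are treated as modulo-2^32 quantities so the "is a before b" test stays correct across wraparound. A small self-contained illustration of the idiom:

    #include <stdint.h>
    #include <stdio.h>

    /*
     * Illustrative only: "a is before b" in 32-bit sequence space, the
     * same idiom used by sfe_ipv6_update_tcp_state() above.
     */
    static int seq_before(uint32_t a, uint32_t b)
    {
    	return (int32_t)(a - b) < 0;
    }

    int main(void)
    {
    	/* Near the wrap point a plain '<' would give the wrong answer. */
    	printf("%d\n", seq_before(0xfffffff0u, 0x00000010u));	/* 1: before */
    	printf("%d\n", seq_before(0x00000010u, 0xfffffff0u));	/* 0: after */
    	return 0;
    }

+
+		/*
+		 * If we already have the flow then it's likely that this
+		 * request to create the connection rule contains more
+		 * up-to-date information. Check and update accordingly.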
+ */ + sfe_ipv6_update_protocol_state(c, sic); + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("connection already exists - mark: %08x, p: %d\n" + " s: %s:%pxM:%pI6:%u, d: %s:%pxM:%pI6:%u\n", + sic->mark, sic->protocol, + sic->src_dev->name, sic->src_mac, sic->src_ip.ip6, ntohs(sic->src_port), + sic->dest_dev->name, sic->dest_mac, sic->dest_ip.ip6, ntohs(sic->dest_port)); + return -EADDRINUSE; + } + + /* + * Allocate the various connection tracking objects. + */ + c = (struct sfe_ipv6_connection *)kmalloc(sizeof(struct sfe_ipv6_connection), GFP_ATOMIC); + if (unlikely(!c)) { + spin_unlock_bh(&si->lock); + return -ENOMEM; + } + + original_cm = (struct sfe_ipv6_connection_match *)kmalloc(sizeof(struct sfe_ipv6_connection_match), GFP_ATOMIC); + if (unlikely(!original_cm)) { + spin_unlock_bh(&si->lock); + kfree(c); + return -ENOMEM; + } + + reply_cm = (struct sfe_ipv6_connection_match *)kmalloc(sizeof(struct sfe_ipv6_connection_match), GFP_ATOMIC); + if (unlikely(!reply_cm)) { + spin_unlock_bh(&si->lock); + kfree(original_cm); + kfree(c); + return -ENOMEM; + } + + /* + * Fill in the "original" direction connection matching object. + * Note that the transmit MAC address is "dest_mac_xlate" because + * we always know both ends of a connection by their translated + * addresses and not their public addresses. + */ + original_cm->match_dev = src_dev; + original_cm->match_protocol = sic->protocol; + original_cm->match_src_ip[0] = sic->src_ip.ip6[0]; + original_cm->match_src_port = sic->src_port; + original_cm->match_dest_ip[0] = sic->dest_ip.ip6[0]; + original_cm->match_dest_port = sic->dest_port; + original_cm->xlate_src_ip[0] = sic->src_ip_xlate.ip6[0]; + original_cm->xlate_src_port = sic->src_port_xlate; + original_cm->xlate_dest_ip[0] = sic->dest_ip_xlate.ip6[0]; + original_cm->xlate_dest_port = sic->dest_port_xlate; + original_cm->rx_packet_count = 0; + original_cm->rx_packet_count64 = 0; + original_cm->rx_byte_count = 0; + original_cm->rx_byte_count64 = 0; + original_cm->xmit_dev = dest_dev; + original_cm->xmit_dev_mtu = sic->dest_mtu; + memcpy(original_cm->xmit_src_mac, dest_dev->dev_addr, ETH_ALEN); + memcpy(original_cm->xmit_dest_mac, sic->dest_mac_xlate, ETH_ALEN); + original_cm->connection = c; + original_cm->counter_match = reply_cm; + original_cm->flags = 0; + if (sic->flags & SFE_CREATE_FLAG_REMARK_PRIORITY) { + original_cm->priority = sic->src_priority; + original_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_PRIORITY_REMARK; + } + if (sic->flags & SFE_CREATE_FLAG_REMARK_DSCP) { + original_cm->dscp = sic->src_dscp << SFE_IPV6_DSCP_SHIFT; + original_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_DSCP_REMARK; + } +#ifdef CONFIG_NF_FLOW_COOKIE + original_cm->flow_cookie = 0; +#endif +#ifdef CONFIG_XFRM + original_cm->flow_accel = sic->original_accel; +#endif + original_cm->active_next = NULL; + original_cm->active_prev = NULL; + original_cm->active = false; + + /* + * For PPP links we don't write an L2 header. For everything else we do. + */ + if (!(dest_dev->flags & IFF_POINTOPOINT)) { + original_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_WRITE_L2_HDR; + + /* + * If our dev writes Ethernet headers then we can write a really fast + * version. + */ + if (dest_dev->header_ops) { + if (dest_dev->header_ops->create == eth_header) { + original_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_WRITE_FAST_ETH_HDR; + } + } + } + + /* + * Fill in the "reply" direction connection matching object. 
+ */ + reply_cm->match_dev = dest_dev; + reply_cm->match_protocol = sic->protocol; + reply_cm->match_src_ip[0] = sic->dest_ip_xlate.ip6[0]; + reply_cm->match_src_port = sic->dest_port_xlate; + reply_cm->match_dest_ip[0] = sic->src_ip_xlate.ip6[0]; + reply_cm->match_dest_port = sic->src_port_xlate; + reply_cm->xlate_src_ip[0] = sic->dest_ip.ip6[0]; + reply_cm->xlate_src_port = sic->dest_port; + reply_cm->xlate_dest_ip[0] = sic->src_ip.ip6[0]; + reply_cm->xlate_dest_port = sic->src_port; + reply_cm->rx_packet_count = 0; + reply_cm->rx_packet_count64 = 0; + reply_cm->rx_byte_count = 0; + reply_cm->rx_byte_count64 = 0; + reply_cm->xmit_dev = src_dev; + reply_cm->xmit_dev_mtu = sic->src_mtu; + memcpy(reply_cm->xmit_src_mac, src_dev->dev_addr, ETH_ALEN); + memcpy(reply_cm->xmit_dest_mac, sic->src_mac, ETH_ALEN); + reply_cm->connection = c; + reply_cm->counter_match = original_cm; + reply_cm->flags = 0; + if (sic->flags & SFE_CREATE_FLAG_REMARK_PRIORITY) { + reply_cm->priority = sic->dest_priority; + reply_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_PRIORITY_REMARK; + } + if (sic->flags & SFE_CREATE_FLAG_REMARK_DSCP) { + reply_cm->dscp = sic->dest_dscp << SFE_IPV6_DSCP_SHIFT; + reply_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_DSCP_REMARK; + } +#ifdef CONFIG_NF_FLOW_COOKIE + reply_cm->flow_cookie = 0; +#endif +#ifdef CONFIG_XFRM + reply_cm->flow_accel = sic->reply_accel; +#endif + reply_cm->active_next = NULL; + reply_cm->active_prev = NULL; + reply_cm->active = false; + + /* + * For PPP links we don't write an L2 header. For everything else we do. + */ + if (!(src_dev->flags & IFF_POINTOPOINT)) { + reply_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_WRITE_L2_HDR; + + /* + * If our dev writes Ethernet headers then we can write a really fast + * version. + */ + if (src_dev->header_ops) { + if (src_dev->header_ops->create == eth_header) { + reply_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_WRITE_FAST_ETH_HDR; + } + } + } + + + if (!sfe_ipv6_addr_equal(sic->dest_ip.ip6, sic->dest_ip_xlate.ip6) || sic->dest_port != sic->dest_port_xlate) { + original_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_XLATE_DEST; + reply_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_XLATE_SRC; + } + + if (!sfe_ipv6_addr_equal(sic->src_ip.ip6, sic->src_ip_xlate.ip6) || sic->src_port != sic->src_port_xlate) { + original_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_XLATE_SRC; + reply_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_XLATE_DEST; + } + + c->protocol = sic->protocol; + c->src_ip[0] = sic->src_ip.ip6[0]; + c->src_ip_xlate[0] = sic->src_ip_xlate.ip6[0]; + c->src_port = sic->src_port; + c->src_port_xlate = sic->src_port_xlate; + c->original_dev = src_dev; + c->original_match = original_cm; + c->dest_ip[0] = sic->dest_ip.ip6[0]; + c->dest_ip_xlate[0] = sic->dest_ip_xlate.ip6[0]; + c->dest_port = sic->dest_port; + c->dest_port_xlate = sic->dest_port_xlate; + c->reply_dev = dest_dev; + c->reply_match = reply_cm; + c->mark = sic->mark; + c->debug_read_seq = 0; + c->last_sync_jiffies = get_jiffies_64(); + + /* + * Take hold of our source and dest devices for the duration of the connection. + */ + dev_hold(c->original_dev); + dev_hold(c->reply_dev); + + /* + * Initialize the protocol-specific information that we track. + */ + switch (sic->protocol) { + case IPPROTO_TCP: + original_cm->protocol_state.tcp.win_scale = sic->src_td_window_scale; + original_cm->protocol_state.tcp.max_win = sic->src_td_max_window ? 
sic->src_td_max_window : 1; + original_cm->protocol_state.tcp.end = sic->src_td_end; + original_cm->protocol_state.tcp.max_end = sic->src_td_max_end; + reply_cm->protocol_state.tcp.win_scale = sic->dest_td_window_scale; + reply_cm->protocol_state.tcp.max_win = sic->dest_td_max_window ? sic->dest_td_max_window : 1; + reply_cm->protocol_state.tcp.end = sic->dest_td_end; + reply_cm->protocol_state.tcp.max_end = sic->dest_td_max_end; + if (sic->flags & SFE_CREATE_FLAG_NO_SEQ_CHECK) { + original_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK; + reply_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK; + } + break; + } + + sfe_ipv6_connection_match_compute_translations(original_cm); + sfe_ipv6_connection_match_compute_translations(reply_cm); + sfe_ipv6_insert_connection(si, c); + + spin_unlock_bh(&si->lock); + + /* + * We have everything we need! + */ + DEBUG_INFO("new connection - mark: %08x, p: %d\n" + " s: %s:%pxM(%pxM):%pI6(%pI6):%u(%u)\n" + " d: %s:%pxM(%pxM):%pI6(%pI6):%u(%u)\n", + sic->mark, sic->protocol, + sic->src_dev->name, sic->src_mac, sic->src_mac_xlate, + sic->src_ip.ip6, sic->src_ip_xlate.ip6, ntohs(sic->src_port), ntohs(sic->src_port_xlate), + dest_dev->name, sic->dest_mac, sic->dest_mac_xlate, + sic->dest_ip.ip6, sic->dest_ip_xlate.ip6, ntohs(sic->dest_port), ntohs(sic->dest_port_xlate)); + + return 0; +} + +/* + * sfe_ipv6_destroy_rule() + * Destroy a forwarding rule. + */ +void sfe_ipv6_destroy_rule(struct sfe_connection_destroy *sid) +{ + struct sfe_ipv6 *si = &__si6; + struct sfe_ipv6_connection *c; + + spin_lock_bh(&si->lock); + si->connection_destroy_requests++; + + /* + * Check to see if we have a flow that matches the rule we're trying + * to destroy. If there isn't then we can't destroy it. + */ + c = sfe_ipv6_find_connection(si, sid->protocol, sid->src_ip.ip6, sid->src_port, + sid->dest_ip.ip6, sid->dest_port); + if (!c) { + si->connection_destroy_misses++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("connection does not exist - p: %d, s: %pI6:%u, d: %pI6:%u\n", + sid->protocol, sid->src_ip.ip6, ntohs(sid->src_port), + sid->dest_ip.ip6, ntohs(sid->dest_port)); + return; + } + + /* + * Remove our connection details from the hash tables. + */ + sfe_ipv6_remove_connection(si, c); + spin_unlock_bh(&si->lock); + + sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_DESTROY); + + DEBUG_INFO("connection destroyed - p: %d, s: %pI6:%u, d: %pI6:%u\n", + sid->protocol, sid->src_ip.ip6, ntohs(sid->src_port), + sid->dest_ip.ip6, ntohs(sid->dest_port)); +} + +/* + * sfe_ipv6_register_sync_rule_callback() + * Register a callback for rule synchronization. + */ +void sfe_ipv6_register_sync_rule_callback(sfe_sync_rule_callback_t sync_rule_callback) +{ + struct sfe_ipv6 *si = &__si6; + + spin_lock_bh(&si->lock); + rcu_assign_pointer(si->sync_rule_callback, sync_rule_callback); + spin_unlock_bh(&si->lock); +} + +/* + * sfe_ipv6_get_debug_dev() + */ +static ssize_t sfe_ipv6_get_debug_dev(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct sfe_ipv6 *si = &__si6; + ssize_t count; + int num; + + spin_lock_bh(&si->lock); + num = si->debug_dev; + spin_unlock_bh(&si->lock); + + count = snprintf(buf, (ssize_t)PAGE_SIZE, "%d\n", num); + return count; +} + +/* + * sfe_ipv6_destroy_all_rules_for_dev() + * Destroy all connections that match a particular device. + * + * If we pass dev as NULL then this destroys all connections. 
+ */ +void sfe_ipv6_destroy_all_rules_for_dev(struct net_device *dev) +{ + struct sfe_ipv6 *si = &__si6; + struct sfe_ipv6_connection *c; + +another_round: + spin_lock_bh(&si->lock); + + for (c = si->all_connections_head; c; c = c->all_connections_next) { + /* + * Does this connection relate to the device we are destroying? + */ + if (!dev + || (dev == c->original_dev) + || (dev == c->reply_dev)) { + break; + } + } + + if (c) { + sfe_ipv6_remove_connection(si, c); + } + + spin_unlock_bh(&si->lock); + + if (c) { + sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_DESTROY); + goto another_round; + } +} + +/* + * sfe_ipv6_periodic_sync() + */ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 15, 0)) +static void sfe_ipv6_periodic_sync(unsigned long arg) +#else +static void sfe_ipv6_periodic_sync(struct timer_list *tl) +#endif +{ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 15, 0)) + struct sfe_ipv6 *si = (struct sfe_ipv6 *)arg; +#else + struct sfe_ipv6 *si = from_timer(si, tl, timer); +#endif + u64 now_jiffies; + int quota; + sfe_sync_rule_callback_t sync_rule_callback; + + now_jiffies = get_jiffies_64(); + + rcu_read_lock(); + sync_rule_callback = rcu_dereference(si->sync_rule_callback); + if (!sync_rule_callback) { + rcu_read_unlock(); + goto done; + } + + spin_lock_bh(&si->lock); + sfe_ipv6_update_summary_stats(si); + + /* + * Get an estimate of the number of connections to parse in this sync. + */ + quota = (si->num_connections + 63) / 64; + + /* + * Walk the "active" list and sync the connection state. + */ + while (quota--) { + struct sfe_ipv6_connection_match *cm; + struct sfe_ipv6_connection_match *counter_cm; + struct sfe_ipv6_connection *c; + struct sfe_connection_sync sis; + + cm = si->active_head; + if (!cm) { + break; + } + + /* + * There's a possibility that our counter match is in the active list too. + * If it is then remove it. + */ + counter_cm = cm->counter_match; + if (counter_cm->active) { + counter_cm->active = false; + + /* + * We must have a connection preceding this counter match + * because that's the one that got us to this point, so we don't have + * to worry about removing the head of the list. + */ + counter_cm->active_prev->active_next = counter_cm->active_next; + + if (likely(counter_cm->active_next)) { + counter_cm->active_next->active_prev = counter_cm->active_prev; + } else { + si->active_tail = counter_cm->active_prev; + } + + counter_cm->active_next = NULL; + counter_cm->active_prev = NULL; + } + + /* + * Now remove the head of the active scan list. + */ + cm->active = false; + si->active_head = cm->active_next; + if (likely(cm->active_next)) { + cm->active_next->active_prev = NULL; + } else { + si->active_tail = NULL; + } + cm->active_next = NULL; + + /* + * Sync the connection state. + */ + c = cm->connection; + sfe_ipv6_gen_sync_connection(si, c, &sis, SFE_SYNC_REASON_STATS, now_jiffies); + + /* + * We don't want to be holding the lock when we sync! + */ + spin_unlock_bh(&si->lock); + sync_rule_callback(&sis); + spin_lock_bh(&si->lock); + } + + spin_unlock_bh(&si->lock); + rcu_read_unlock(); + +done: + mod_timer(&si->timer, jiffies + ((HZ + 99) / 100)); +} + +/* + * sfe_ipv6_debug_dev_read_start() + * Generate part of the XML output. 
+ */
+static bool sfe_ipv6_debug_dev_read_start(struct sfe_ipv6 *si, char *buffer, char *msg, size_t *length,
+					  int *total_read, struct sfe_ipv6_debug_xml_write_state *ws)
+{
+	int bytes_read;
+
+	si->debug_read_seq++;
+
+	bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "<sfe_ipv6>\n");
+	if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) {
+		return false;
+	}
+
+	*length -= bytes_read;
+	*total_read += bytes_read;
+
+	ws->state++;
+	return true;
+}
+
+/*
+ * sfe_ipv6_debug_dev_read_connections_start()
+ *	Generate part of the XML output.
+ */
+static bool sfe_ipv6_debug_dev_read_connections_start(struct sfe_ipv6 *si, char *buffer, char *msg, size_t *length,
+						      int *total_read, struct sfe_ipv6_debug_xml_write_state *ws)
+{
+	int bytes_read;
+
+	bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t<connections>\n");
+	if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) {
+		return false;
+	}
+
+	*length -= bytes_read;
+	*total_read += bytes_read;
+
+	ws->state++;
+	return true;
+}
+
+/*
+ * sfe_ipv6_debug_dev_read_connections_connection()
+ *	Generate part of the XML output.
+ */
+static bool sfe_ipv6_debug_dev_read_connections_connection(struct sfe_ipv6 *si, char *buffer, char *msg, size_t *length,
+							    int *total_read, struct sfe_ipv6_debug_xml_write_state *ws)
+{
+	struct sfe_ipv6_connection *c;
+	struct sfe_ipv6_connection_match *original_cm;
+	struct sfe_ipv6_connection_match *reply_cm;
+	int bytes_read;
+	int protocol;
+	struct net_device *src_dev;
+	struct sfe_ipv6_addr src_ip;
+	struct sfe_ipv6_addr src_ip_xlate;
+	__be16 src_port;
+	__be16 src_port_xlate;
+	u64 src_rx_packets;
+	u64 src_rx_bytes;
+	struct net_device *dest_dev;
+	struct sfe_ipv6_addr dest_ip;
+	struct sfe_ipv6_addr dest_ip_xlate;
+	__be16 dest_port;
+	__be16 dest_port_xlate;
+	u64 dest_rx_packets;
+	u64 dest_rx_bytes;
+	u64 last_sync_jiffies;
+	u32 mark, src_priority, dest_priority, src_dscp, dest_dscp;
+#ifdef CONFIG_NF_FLOW_COOKIE
+	int src_flow_cookie, dst_flow_cookie;
+#endif
+
+	spin_lock_bh(&si->lock);
+
+	for (c = si->all_connections_head; c; c = c->all_connections_next) {
+		if (c->debug_read_seq < si->debug_read_seq) {
+			c->debug_read_seq = si->debug_read_seq;
+			break;
+		}
+	}
+
+	/*
+	 * If there were no connections then move to the next state.
+	 */
+	if (!c) {
+		spin_unlock_bh(&si->lock);
+		ws->state++;
+		return true;
+	}
+
+	original_cm = c->original_match;
+	reply_cm = c->reply_match;
+
+	protocol = c->protocol;
+	src_dev = c->original_dev;
+	src_ip = c->src_ip[0];
+	src_ip_xlate = c->src_ip_xlate[0];
+	src_port = c->src_port;
+	src_port_xlate = c->src_port_xlate;
+	src_priority = original_cm->priority;
+	src_dscp = original_cm->dscp >> SFE_IPV6_DSCP_SHIFT;
+
+	sfe_ipv6_connection_match_update_summary_stats(original_cm);
+	sfe_ipv6_connection_match_update_summary_stats(reply_cm);
+
+	src_rx_packets = original_cm->rx_packet_count64;
+	src_rx_bytes = original_cm->rx_byte_count64;
+	dest_dev = c->reply_dev;
+	dest_ip = c->dest_ip[0];
+	dest_ip_xlate = c->dest_ip_xlate[0];
+	dest_port = c->dest_port;
+	dest_port_xlate = c->dest_port_xlate;
+	dest_priority = reply_cm->priority;
+	dest_dscp = reply_cm->dscp >> SFE_IPV6_DSCP_SHIFT;
+	dest_rx_packets = reply_cm->rx_packet_count64;
+	dest_rx_bytes = reply_cm->rx_byte_count64;
+	last_sync_jiffies = get_jiffies_64() - c->last_sync_jiffies;
+	mark = c->mark;
+#ifdef CONFIG_NF_FLOW_COOKIE
+	src_flow_cookie = original_cm->flow_cookie;
+	dst_flow_cookie = reply_cm->flow_cookie;
+#endif
+	spin_unlock_bh(&si->lock);
+
+	bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t\t<connection "
+			      "protocol=\"%u\" "
+			      "src_dev=\"%s\" "
+			      "src_ip=\"%pI6\" src_ip_xlate=\"%pI6\" "
+			      "src_port=\"%u\" src_port_xlate=\"%u\" "
+			      "src_priority=\"%u\" src_dscp=\"%u\" "
+			      "src_rx_pkts=\"%llu\" src_rx_bytes=\"%llu\" "
+			      "dest_dev=\"%s\" "
+			      "dest_ip=\"%pI6\" dest_ip_xlate=\"%pI6\" "
+			      "dest_port=\"%u\" dest_port_xlate=\"%u\" "
+			      "dest_priority=\"%u\" dest_dscp=\"%u\" "
+			      "dest_rx_pkts=\"%llu\" dest_rx_bytes=\"%llu\" "
+#ifdef CONFIG_NF_FLOW_COOKIE
+			      "src_flow_cookie=\"%d\" dst_flow_cookie=\"%d\" "
+#endif
+			      "last_sync=\"%llu\" "
+			      "mark=\"%08x\" />\n",
+			      protocol,
+			      src_dev->name,
+			      &src_ip, &src_ip_xlate,
+			      ntohs(src_port), ntohs(src_port_xlate),
+			      src_priority, src_dscp,
+			      src_rx_packets, src_rx_bytes,
+			      dest_dev->name,
+			      &dest_ip, &dest_ip_xlate,
+			      ntohs(dest_port), ntohs(dest_port_xlate),
+			      dest_priority, dest_dscp,
+			      dest_rx_packets, dest_rx_bytes,
+#ifdef CONFIG_NF_FLOW_COOKIE
+			      src_flow_cookie, dst_flow_cookie,
+#endif
+			      last_sync_jiffies, mark);
+
+	if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) {
+		return false;
+	}
+
+	*length -= bytes_read;
+	*total_read += bytes_read;
+
+	return true;
+}
+
+/*
+ * sfe_ipv6_debug_dev_read_connections_end()
+ *	Generate part of the XML output.
+ */
+static bool sfe_ipv6_debug_dev_read_connections_end(struct sfe_ipv6 *si, char *buffer, char *msg, size_t *length,
+						    int *total_read, struct sfe_ipv6_debug_xml_write_state *ws)
+{
+	int bytes_read;
+
+	bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t</connections>\n");
+	if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) {
+		return false;
+	}
+
+	*length -= bytes_read;
+	*total_read += bytes_read;
+
+	ws->state++;
+	return true;
+}
+
+/*
+ * sfe_ipv6_debug_dev_read_exceptions_start()
+ *	Generate part of the XML output.
+ */
+static bool sfe_ipv6_debug_dev_read_exceptions_start(struct sfe_ipv6 *si, char *buffer, char *msg, size_t *length,
+						     int *total_read, struct sfe_ipv6_debug_xml_write_state *ws)
+{
+	int bytes_read;
+
+	bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t<exceptions>\n");
+	if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) {
+		return false;
+	}
+
+	*length -= bytes_read;
+	*total_read += bytes_read;
+
+	ws->state++;
+	return true;
+}
+
+/*
+ * sfe_ipv6_debug_dev_read_exceptions_exception()
+ *	Generate part of the XML output.
+ */
+static bool sfe_ipv6_debug_dev_read_exceptions_exception(struct sfe_ipv6 *si, char *buffer, char *msg, size_t *length,
+							 int *total_read, struct sfe_ipv6_debug_xml_write_state *ws)
+{
+	u64 ct;
+
+	spin_lock_bh(&si->lock);
+	ct = si->exception_events64[ws->iter_exception];
+	spin_unlock_bh(&si->lock);
+
+	if (ct) {
+		int bytes_read;
+
+		bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE,
+				      "\t\t<exception name=\"%s\" count=\"%llu\" />\n",
+				      sfe_ipv6_exception_events_string[ws->iter_exception],
+				      ct);
+		if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) {
+			return false;
+		}
+
+		*length -= bytes_read;
+		*total_read += bytes_read;
+	}
+
+	ws->iter_exception++;
+	if (ws->iter_exception >= SFE_IPV6_EXCEPTION_EVENT_LAST) {
+		ws->iter_exception = 0;
+		ws->state++;
+	}
+
+	return true;
+}
+
+/*
+ * sfe_ipv6_debug_dev_read_exceptions_end()
+ *	Generate part of the XML output.
+ */
+static bool sfe_ipv6_debug_dev_read_exceptions_end(struct sfe_ipv6 *si, char *buffer, char *msg, size_t *length,
+						   int *total_read, struct sfe_ipv6_debug_xml_write_state *ws)
+{
+	int bytes_read;
+
+	bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t</exceptions>\n");
+	if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) {
+		return false;
+	}
+
+	*length -= bytes_read;
+	*total_read += bytes_read;
+
+	ws->state++;
+	return true;
+}
+
+/*
+ * sfe_ipv6_debug_dev_read_stats()
+ *	Generate part of the XML output.
+ */
+static bool sfe_ipv6_debug_dev_read_stats(struct sfe_ipv6 *si, char *buffer, char *msg, size_t *length,
+					  int *total_read, struct sfe_ipv6_debug_xml_write_state *ws)
+{
+	int bytes_read;
+	unsigned int num_connections;
+	u64 packets_forwarded;
+	u64 packets_not_forwarded;
+	u64 connection_create_requests;
+	u64 connection_create_collisions;
+	u64 connection_destroy_requests;
+	u64 connection_destroy_misses;
+	u64 connection_flushes;
+	u64 connection_match_hash_hits;
+	u64 connection_match_hash_reorders;
+
+	spin_lock_bh(&si->lock);
+	sfe_ipv6_update_summary_stats(si);
+
+	num_connections = si->num_connections;
+	packets_forwarded = si->packets_forwarded64;
+	packets_not_forwarded = si->packets_not_forwarded64;
+	connection_create_requests = si->connection_create_requests64;
+	connection_create_collisions = si->connection_create_collisions64;
+	connection_destroy_requests = si->connection_destroy_requests64;
+	connection_destroy_misses = si->connection_destroy_misses64;
+	connection_flushes = si->connection_flushes64;
+	connection_match_hash_hits = si->connection_match_hash_hits64;
+	connection_match_hash_reorders = si->connection_match_hash_reorders64;
+	spin_unlock_bh(&si->lock);
+
+	bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t<stats "
+			      "num_connections=\"%u\" "
+			      "pkts_forwarded=\"%llu\" pkts_not_forwarded=\"%llu\" "
+			      "create_requests=\"%llu\" create_collisions=\"%llu\" "
+			      "destroy_requests=\"%llu\" destroy_misses=\"%llu\" "
+			      "flushes=\"%llu\" "
+			      "hash_hits=\"%llu\" hash_reorders=\"%llu\" />\n",
+			      num_connections,
+			      packets_forwarded,
+			      packets_not_forwarded,
+			      connection_create_requests,
+			      connection_create_collisions,
+			      connection_destroy_requests,
+			      connection_destroy_misses,
+			      connection_flushes,
+			      connection_match_hash_hits,
+			      connection_match_hash_reorders);
+	if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) {
+		return false;
+	}
+
+	*length -= bytes_read;
+	*total_read += bytes_read;
+
+	ws->state++;
+	return true;
+}
+
+/*
+ * sfe_ipv6_debug_dev_read_end()
+ *	Generate part of the XML output.
+ */
+static bool sfe_ipv6_debug_dev_read_end(struct sfe_ipv6 *si, char *buffer, char *msg, size_t *length,
+					int *total_read, struct sfe_ipv6_debug_xml_write_state *ws)
+{
+	int bytes_read;
+
+	bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "</sfe_ipv6>\n");
+	if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) {
+		return false;
+	}
+
+	*length -= bytes_read;
+	*total_read += bytes_read;
+
+	ws->state++;
+	return true;
+}
+
+/*
+ * Array of write functions that write various XML elements that correspond to
+ * our XML output state machine.
+ */
+static sfe_ipv6_debug_xml_write_method_t sfe_ipv6_debug_xml_write_methods[SFE_IPV6_DEBUG_XML_STATE_DONE] = {
+	sfe_ipv6_debug_dev_read_start,
+	sfe_ipv6_debug_dev_read_connections_start,
+	sfe_ipv6_debug_dev_read_connections_connection,
+	sfe_ipv6_debug_dev_read_connections_end,
+	sfe_ipv6_debug_dev_read_exceptions_start,
+	sfe_ipv6_debug_dev_read_exceptions_exception,
+	sfe_ipv6_debug_dev_read_exceptions_end,
+	sfe_ipv6_debug_dev_read_stats,
+	sfe_ipv6_debug_dev_read_end,
+};
+
+/*
+ * sfe_ipv6_debug_dev_read()
+ *	Send info to userspace upon read request from user
+ */
+static ssize_t sfe_ipv6_debug_dev_read(struct file *filp, char *buffer, size_t length, loff_t *offset)
+{
+	char msg[CHAR_DEV_MSG_SIZE];
+	int total_read = 0;
+	struct sfe_ipv6_debug_xml_write_state *ws;
+	struct sfe_ipv6 *si = &__si6;
+
+	ws = (struct sfe_ipv6_debug_xml_write_state *)filp->private_data;
+	while ((ws->state != SFE_IPV6_DEBUG_XML_STATE_DONE) && (length > CHAR_DEV_MSG_SIZE)) {
+		if (!(sfe_ipv6_debug_xml_write_methods[ws->state])(si, buffer, msg, &length, &total_read, ws)) {
+			/*
+			 * copy_to_user() failed - bail out rather than spin.
+			 */
+			break;
+		}
+	}
+
+	return total_read;
+}
+
+/*
+ * sfe_ipv6_debug_dev_write()
+ *	Write to char device resets some stats
+ */
+static ssize_t sfe_ipv6_debug_dev_write(struct file *filp, const char *buffer, size_t length, loff_t *offset)
+{
+	struct sfe_ipv6 *si = &__si6;
+
+	spin_lock_bh(&si->lock);
+	sfe_ipv6_update_summary_stats(si);
+
+	si->packets_forwarded64 = 0;
+	si->packets_not_forwarded64 = 0;
+	si->connection_create_requests64 = 0;
+	si->connection_create_collisions64 = 0;
+	si->connection_destroy_requests64 = 0;
+	si->connection_destroy_misses64 = 0;
+	si->connection_flushes64 = 0;
+	si->connection_match_hash_hits64 = 0;
+	si->connection_match_hash_reorders64 = 0;
+	spin_unlock_bh(&si->lock);
+
+	return length;
+}
+
+/*
+ * sfe_ipv6_debug_dev_open()
+ */
+static int sfe_ipv6_debug_dev_open(struct inode *inode, struct file *file)
+{
+	struct sfe_ipv6_debug_xml_write_state *ws;
+
+	ws = (struct sfe_ipv6_debug_xml_write_state *)file->private_data;
+	if (ws) {
+		return 0;
+	}
+
+	ws = kzalloc(sizeof(struct sfe_ipv6_debug_xml_write_state), GFP_KERNEL);
+	if (!ws) {
+		return -ENOMEM;
+	}
+
+	ws->state = SFE_IPV6_DEBUG_XML_STATE_START;
+	file->private_data = ws;
+
+	return 0;
+}
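
Read in full, the debug device therefore emits a document of this shape (values and attribute lists abbreviated for illustration):

    <sfe_ipv6>
    	<connections>
    		<connection protocol="6" src_dev="eth0" src_port="49152" ... />
    	</connections>
    	<exceptions>
    		<exception name="UDP_HEADER_INCOMPLETE" count="42" />
    	</exceptions>
    	<stats num_connections="1" pkts_forwarded="1234" ... />
    </sfe_ipv6>

+/*
+ * sfe_ipv6_debug_dev_release()
+ */
+static int sfe_ipv6_debug_dev_release(struct inode *inode, struct file *file)
+{
+	struct sfe_ipv6_debug_xml_write_state *ws;
+
+	ws = (struct sfe_ipv6_debug_xml_write_state *)file->private_data;
+	if (ws) {
+		/*
+		 * We've finished with our output so free the write state.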
+ */
+		kfree(ws);
+	}
+
+	return 0;
+}
+
+/*
+ * File operations used in the debug char device
+ */
+static struct file_operations sfe_ipv6_debug_dev_fops = {
+	.read = sfe_ipv6_debug_dev_read,
+	.write = sfe_ipv6_debug_dev_write,
+	.open = sfe_ipv6_debug_dev_open,
+	.release = sfe_ipv6_debug_dev_release
+};
+
+#ifdef CONFIG_NF_FLOW_COOKIE
+/*
+ * sfe_ipv6_register_flow_cookie_cb
+ *	register a function in SFE to let SFE use this function to configure flow cookie for a flow
+ *
+ * Hardware drivers which support flow cookies should register a callback function in SFE.
+ * SFE then uses this function to configure the flow cookie for a flow.
+ * return: 0, success; !=0, fail
+ */
+int sfe_ipv6_register_flow_cookie_cb(sfe_ipv6_flow_cookie_set_func_t cb)
+{
+	struct sfe_ipv6 *si = &__si6;
+
+	BUG_ON(!cb);
+
+	if (si->flow_cookie_set_func) {
+		return -1;
+	}
+
+	rcu_assign_pointer(si->flow_cookie_set_func, cb);
+	return 0;
+}
+
+/*
+ * sfe_ipv6_unregister_flow_cookie_cb
+ *	unregister the function which is used to configure flow cookie for a flow
+ *
+ * return: 0, success; !=0, fail
+ */
+int sfe_ipv6_unregister_flow_cookie_cb(sfe_ipv6_flow_cookie_set_func_t cb)
+{
+	struct sfe_ipv6 *si = &__si6;
+
+	RCU_INIT_POINTER(si->flow_cookie_set_func, NULL);
+	return 0;
+}
+
+/*
+ * sfe_ipv6_get_flow_cookie()
+ */
+static ssize_t sfe_ipv6_get_flow_cookie(struct device *dev,
+					struct device_attribute *attr,
+					char *buf)
+{
+	struct sfe_ipv6 *si = &__si6;
+	return snprintf(buf, (ssize_t)PAGE_SIZE, "%d\n", si->flow_cookie_enable);
+}
+
+/*
+ * sfe_ipv6_set_flow_cookie()
+ */
+static ssize_t sfe_ipv6_set_flow_cookie(struct device *dev,
+					struct device_attribute *attr,
+					const char *buf, size_t size)
+{
+	struct sfe_ipv6 *si = &__si6;
+	long enable = 0;
+
+	if (kstrtol(buf, 0, &enable)) {
+		return -EINVAL;
+	}
+	si->flow_cookie_enable = enable;
+
+	return size;
+}
+
+/*
+ * sysfs attributes.
+ */
+static const struct device_attribute sfe_ipv6_flow_cookie_attr =
+	__ATTR(flow_cookie_enable, S_IWUSR | S_IRUGO, sfe_ipv6_get_flow_cookie, sfe_ipv6_set_flow_cookie);
+#endif /*CONFIG_NF_FLOW_COOKIE*/
+
+/*
+ * sfe_ipv6_init()
+ */
+static int __init sfe_ipv6_init(void)
+{
+	struct sfe_ipv6 *si = &__si6;
+	int result = -1;
+
+	DEBUG_INFO("SFE IPv6 init\n");
+
+	/*
+	 * Create sys/sfe_ipv6
+	 */
+	si->sys_sfe_ipv6 = kobject_create_and_add("sfe_ipv6", NULL);
+	if (!si->sys_sfe_ipv6) {
+		DEBUG_ERROR("failed to register sfe_ipv6\n");
+		goto exit1;
+	}
+
+	/*
+	 * Create files, one for each parameter supported by this module.
+	 */
+	result = sysfs_create_file(si->sys_sfe_ipv6, &sfe_ipv6_debug_dev_attr.attr);
+	if (result) {
+		DEBUG_ERROR("failed to register debug dev file: %d\n", result);
+		goto exit2;
+	}
+
+#ifdef CONFIG_NF_FLOW_COOKIE
+	result = sysfs_create_file(si->sys_sfe_ipv6, &sfe_ipv6_flow_cookie_attr.attr);
+	if (result) {
+		DEBUG_ERROR("failed to register flow cookie enable file: %d\n", result);
+		goto exit3;
+	}
+#endif /* CONFIG_NF_FLOW_COOKIE */
+
+	/*
+	 * Register our debug char device.
+	 */
+	result = register_chrdev(0, "sfe_ipv6", &sfe_ipv6_debug_dev_fops);
+	if (result < 0) {
+		DEBUG_ERROR("Failed to register chrdev: %d\n", result);
+		goto exit4;
+	}
+
+	si->debug_dev = result;
+
+	/*
+	 * Create a timer to handle periodic statistics.
+ */
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 15, 0))
+	setup_timer(&si->timer, sfe_ipv6_periodic_sync, (unsigned long)si);
+#else
+	timer_setup(&si->timer, sfe_ipv6_periodic_sync, 0);
+#endif
+
+	/*
+	 * Initialise the lock before the timer can fire.
+	 */
+	spin_lock_init(&si->lock);
+
+	mod_timer(&si->timer, jiffies + ((HZ + 99) / 100));
+
+	return 0;
+
+exit4:
+#ifdef CONFIG_NF_FLOW_COOKIE
+	sysfs_remove_file(si->sys_sfe_ipv6, &sfe_ipv6_flow_cookie_attr.attr);
+
+exit3:
+#endif /* CONFIG_NF_FLOW_COOKIE */
+	sysfs_remove_file(si->sys_sfe_ipv6, &sfe_ipv6_debug_dev_attr.attr);
+
+exit2:
+	kobject_put(si->sys_sfe_ipv6);
+
+exit1:
+	return result;
+}
+
+/*
+ * sfe_ipv6_exit()
+ */
+static void __exit sfe_ipv6_exit(void)
+{
+	struct sfe_ipv6 *si = &__si6;
+
+	DEBUG_INFO("SFE IPv6 exit\n");
+
+	/*
+	 * Destroy all connections.
+	 */
+	sfe_ipv6_destroy_all_rules_for_dev(NULL);
+
+	del_timer_sync(&si->timer);
+
+	unregister_chrdev(si->debug_dev, "sfe_ipv6");
+
+#ifdef CONFIG_NF_FLOW_COOKIE
+	sysfs_remove_file(si->sys_sfe_ipv6, &sfe_ipv6_flow_cookie_attr.attr);
+#endif /* CONFIG_NF_FLOW_COOKIE */
+	sysfs_remove_file(si->sys_sfe_ipv6, &sfe_ipv6_debug_dev_attr.attr);
+
+	kobject_put(si->sys_sfe_ipv6);
+}
+
+module_init(sfe_ipv6_init)
+module_exit(sfe_ipv6_exit)
+
+EXPORT_SYMBOL(sfe_ipv6_recv);
+EXPORT_SYMBOL(sfe_ipv6_create_rule);
+EXPORT_SYMBOL(sfe_ipv6_destroy_rule);
+EXPORT_SYMBOL(sfe_ipv6_destroy_all_rules_for_dev);
+EXPORT_SYMBOL(sfe_ipv6_register_sync_rule_callback);
+EXPORT_SYMBOL(sfe_ipv6_mark_rule);
+EXPORT_SYMBOL(sfe_ipv6_update_rule);
+#ifdef CONFIG_NF_FLOW_COOKIE
+EXPORT_SYMBOL(sfe_ipv6_register_flow_cookie_cb);
+EXPORT_SYMBOL(sfe_ipv6_unregister_flow_cookie_cb);
+#endif
+
+MODULE_DESCRIPTION("Shortcut Forwarding Engine - IPv6 support");
+MODULE_LICENSE("Dual BSD/GPL");
+
diff --git a/shortcut-fe/simulated-driver/Makefile b/shortcut-fe/simulated-driver/Makefile
new file mode 100644
index 000000000..ecf9c41bd
--- /dev/null
+++ b/shortcut-fe/simulated-driver/Makefile
@@ -0,0 +1,60 @@
+#
+# Copyright (c) 2015,2016 The Linux Foundation. All rights reserved.
+# Permission to use, copy, modify, and/or distribute this software for
+# any purpose with or without fee is hereby granted, provided that the
+# above copyright notice and this permission notice appear in all copies.
+# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
+# OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+#
+
+include $(TOPDIR)/rules.mk
+include $(INCLUDE_DIR)/kernel.mk
+
+PKG_NAME:=shortcut-fe-simulated-driver
+PKG_RELEASE:=1
+
+PKG_SOURCE_URL:=https://source.codeaurora.org/quic/qsdk/oss/lklm/shortcut-fe
+PKG_SOURCE_PROTO:=git
+PKG_SOURCE_DATE:=2021-03-17
+PKG_SOURCE_VERSION:=697977d8d0ccf0ab596e5692d08608a75dd7f33d
+PKG_MIRROR_HASH:=659fa82a431e15af797a6c7069faeee02810453ad8b576c51c29f95a1761a045
+
+include $(INCLUDE_DIR)/package.mk
+
+define KernelPackage/shortcut-fe-drv
+  SECTION:=kernel
+  CATEGORY:=Kernel modules
+  SUBMENU:=Network Support
+  DEPENDS:=@TARGET_ipq806x||TARGET_ipq807x +kmod-shortcut-fe
+  KCONFIG:= \
+	CONFIG_NET_CLS_ACT=y \
+	CONFIG_XFRM=y
+  TITLE:=Simulated sfe driver for ECM
+  FILES:=$(PKG_BUILD_DIR)/simulated-driver/shortcut-fe-drv.ko
+endef
+
+define KernelPackage/shortcut-fe-drv/Description
+Simulated SFE driver which acts as an adapter, converting messages
+between a connection manager and the SFE core engine.
+endef
+
+define Build/Compile
+	$(MAKE) $(PKG_JOBS) -C "$(LINUX_DIR)" \
+		$(KERNEL_MAKE_FLAGS) \
+		$(PKG_MAKE_FLAGS) \
+		M="$(PKG_BUILD_DIR)/simulated-driver" \
+		EXTRA_CFLAGS="-DSFE_SUPPORT_IPV6" \
+		modules
+endef
+
+define Build/InstallDev
+	$(INSTALL_DIR) $(1)/usr/include/shortcut-fe
+	$(CP) -rf $(PKG_BUILD_DIR)/simulated-driver/sfe_drv.h $(1)/usr/include/shortcut-fe
+endef
+
+$(eval $(call KernelPackage,shortcut-fe-drv))
diff --git a/shortcut-fe/simulated-driver/patches/200-nss-qdisc-support.patch b/shortcut-fe/simulated-driver/patches/200-nss-qdisc-support.patch
new file mode 100644
index 000000000..638ad8a84
--- /dev/null
+++ b/shortcut-fe/simulated-driver/patches/200-nss-qdisc-support.patch
@@ -0,0 +1,11 @@
+--- ./simulated-driver/sfe_drv.c.orig	2020-06-16 12:49:47.680153371 +0800
++++ ./simulated-driver/sfe_drv.c	2020-06-16 12:50:18.540153371 +0800
+@@ -1167,7 +1167,7 @@ int sfe_drv_recv(struct sk_buff *skb)
+ 	 * If ingress Qdisc configured, and packet not processed by ingress Qdisc yet
+ 	 * We can not accelerate this packet.
+ 	 */
+-	if (dev->ingress_queue && !(skb->tc_verd & TC_NCLS)) {
++	if (dev->ingress_queue && !(skb->tc_verd_qca_nss & TC_NCLS)) {
+ 		return 0;
+ 	}
+ #endif
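
For context, a connection manager built on top of this engine drives the exported API shown earlier (sfe_ipv6_create_rule(), sfe_ipv6_destroy_rule(), sfe_ipv6_register_sync_rule_callback()). A heavily abridged, hypothetical sketch of that call pattern follows; the field and flag names are taken from this file, while my_sync_callback, my_offload_flow and all values are invented for illustration, and the SFE export headers are assumed to be on the include path:

    #include <linux/netdevice.h>

    /* Hypothetical sync handler: push SFE stats back into conntrack state. */
    static void my_sync_callback(struct sfe_connection_sync *sis)
    {
    	/* ... update conntrack timeouts/counters from sis ... */
    }

    static int my_offload_flow(struct net_device *src_dev, struct net_device *dest_dev)
    {
    	struct sfe_connection_create sic = { 0 };

    	sic.protocol = IPPROTO_TCP;
    	sic.src_dev = src_dev;
    	sic.dest_dev = dest_dev;

    	/*
    	 * 5-tuple fields (src_ip.ip6, src_port, dest_ip.ip6, dest_port,
    	 * MAC addresses, TCP window state) elided here. For plain IPv6
    	 * forwarding the xlate fields simply repeat the originals.
    	 */
    	sic.src_ip_xlate = sic.src_ip;
    	sic.src_port_xlate = sic.src_port;
    	sic.dest_ip_xlate = sic.dest_ip;
    	sic.dest_port_xlate = sic.dest_port;

    	sic.src_mtu = src_dev->mtu;
    	sic.dest_mtu = dest_dev->mtu;
    	sic.flags = SFE_CREATE_FLAG_NO_SEQ_CHECK;

    	return sfe_ipv6_create_rule(&sic);
    }

    /* Registered once at start of day, e.g. from the manager's init path: */
    /* sfe_ipv6_register_sync_rule_callback(my_sync_callback); */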