diff --git a/fast-classifier/Makefile b/fast-classifier/Makefile new file mode 100644 index 000000000..58dd06e01 --- /dev/null +++ b/fast-classifier/Makefile @@ -0,0 +1,10 @@ +obj-$(CONFIG_FAST_CLASSIFIER) += fast-classifier.o + +ifeq ($(SFE_SUPPORT_IPV6),) +SFE_SUPPORT_IPV6=y +endif +ccflags-$(SFE_SUPPORT_IPV6) += -DSFE_SUPPORT_IPV6 + +ccflags-y += -I$(obj)/../shortcut-fe + +obj ?= . diff --git a/fast-classifier/fast-classifier.c b/fast-classifier/fast-classifier.c new file mode 100644 index 000000000..d79404cba --- /dev/null +++ b/fast-classifier/fast-classifier.c @@ -0,0 +1,1892 @@ +/* + * fast-classifier.c + * Shortcut forwarding engine connection manager. + * fast-classifier + * + * Copyright (c) 2013-2018 The Linux Foundation. All rights reserved. + * Permission to use, copy, modify, and/or distribute this software for + * any purpose with or without fee is hereby granted, provided that the + * above copyright notice and this permission notice appear in all copies. + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT + * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include "fast-classifier.h" + +typedef enum fast_classifier_exception { + FAST_CL_EXCEPTION_PACKET_BROADCAST, + FAST_CL_EXCEPTION_PACKET_MULTICAST, + FAST_CL_EXCEPTION_NO_IIF, + FAST_CL_EXCEPTION_NO_CT, + FAST_CL_EXCEPTION_CT_NO_TRACK, + FAST_CL_EXCEPTION_CT_NO_CONFIRM, + FAST_CL_EXCEPTION_CT_IS_ALG, + FAST_CL_EXCEPTION_IS_IPV4_MCAST, + FAST_CL_EXCEPTION_IS_IPV6_MCAST, + FAST_CL_EXCEPTION_TCP_NOT_ASSURED, + FAST_CL_EXCEPTION_TCP_NOT_ESTABLISHED, + FAST_CL_EXCEPTION_UNKNOW_PROTOCOL, + FAST_CL_EXCEPTION_NO_SRC_DEV, + FAST_CL_EXCEPTION_NO_SRC_XLATE_DEV, + FAST_CL_EXCEPTION_NO_DEST_DEV, + FAST_CL_EXCEPTION_NO_DEST_XLATE_DEV, + FAST_CL_EXCEPTION_NO_BRIDGE, + FAST_CL_EXCEPTION_LOCAL_OUT, + FAST_CL_EXCEPTION_WAIT_FOR_ACCELERATION, + FAST_CL_EXCEPTION_UPDATE_PROTOCOL_FAIL, + FAST_CL_EXCEPTION_CT_DESTROY_MISS, + FAST_CL_EXCEPTION_MAX +} fast_classifier_exception_t; + +static char *fast_classifier_exception_events_string[FAST_CL_EXCEPTION_MAX] = { + "PACKET_BROADCAST", + "PACKET_MULTICAST", + "NO_IIF", + "NO_CT", + "CT_NO_TRACK", + "CT_NO_CONFIRM", + "CT_IS_ALG", + "IS_IPV4_MCAST", + "IS_IPV6_MCAST", + "TCP_NOT_ASSURED", + "TCP_NOT_ESTABLISHED", + "UNKNOW_PROTOCOL", + "NO_SRC_DEV", + "NO_SRC_XLATE_DEV", + "NO_DEST_DEV", + "NO_DEST_XLATE_DEV", + "NO_BRIDGE", + "LOCAL_OUT", + "WAIT_FOR_ACCELERATION", + "UPDATE_PROTOCOL_FAIL", + "CT_DESTROY_MISS", +}; + +/* + * Per-module structure. + */ +struct fast_classifier { + spinlock_t lock; /* Lock for SMP correctness */ + + /* + * Control state. + */ + struct kobject *sys_fast_classifier; /* sysfs linkage */ + + /* + * Callback notifiers. 
+ */ + struct notifier_block dev_notifier; /* Device notifier */ + struct notifier_block inet_notifier; /* IPv4 notifier */ + struct notifier_block inet6_notifier; /* IPv6 notifier */ + u32 exceptions[FAST_CL_EXCEPTION_MAX]; +}; + +static struct fast_classifier __sc; + +static struct nla_policy fast_classifier_genl_policy[FAST_CLASSIFIER_A_MAX + 1] = { + [FAST_CLASSIFIER_A_TUPLE] = { + .type = NLA_UNSPEC, + .len = sizeof(struct fast_classifier_tuple) + }, +}; + +static struct genl_multicast_group fast_classifier_genl_mcgrp[] = { + { + .name = FAST_CLASSIFIER_GENL_MCGRP, + }, +}; + +static struct genl_family fast_classifier_gnl_family = { + .id = GENL_ID_GENERATE, + .hdrsize = FAST_CLASSIFIER_GENL_HDRSIZE, + .name = FAST_CLASSIFIER_GENL_NAME, + .version = FAST_CLASSIFIER_GENL_VERSION, + .maxattr = FAST_CLASSIFIER_A_MAX, +}; + +static int fast_classifier_offload_genl_msg(struct sk_buff *skb, struct genl_info *info); +static int fast_classifier_nl_genl_msg_DUMP(struct sk_buff *skb, struct netlink_callback *cb); + +static struct genl_ops fast_classifier_gnl_ops[] = { + { + .cmd = FAST_CLASSIFIER_C_OFFLOAD, + .flags = 0, + .policy = fast_classifier_genl_policy, + .doit = fast_classifier_offload_genl_msg, + .dumpit = NULL, + }, + { + .cmd = FAST_CLASSIFIER_C_OFFLOADED, + .flags = 0, + .policy = fast_classifier_genl_policy, + .doit = NULL, + .dumpit = fast_classifier_nl_genl_msg_DUMP, + }, + { + .cmd = FAST_CLASSIFIER_C_DONE, + .flags = 0, + .policy = fast_classifier_genl_policy, + .doit = NULL, + .dumpit = fast_classifier_nl_genl_msg_DUMP, + }, +}; + +static atomic_t offload_msgs = ATOMIC_INIT(0); +static atomic_t offload_no_match_msgs = ATOMIC_INIT(0); +static atomic_t offloaded_msgs = ATOMIC_INIT(0); +static atomic_t done_msgs = ATOMIC_INIT(0); + +static atomic_t offloaded_fail_msgs = ATOMIC_INIT(0); +static atomic_t done_fail_msgs = ATOMIC_INIT(0); + +/* + * Accelerate incoming packets destined for bridge device + * If a incoming packet is ultimatly destined for + * a bridge device we will first see the packet coming + * from the phyiscal device, we can skip straight to + * processing the packet like it came from the bridge + * for some more performance gains + * + * This only works when the hook is above the bridge. We + * only implement ingress for now, because for egress we + * want to have the bridge devices qdiscs be used. + */ +static bool skip_to_bridge_ingress; + +/* + * fast_classifier_incr_exceptions() + * increase an exception counter. + */ +static inline void fast_classifier_incr_exceptions(fast_classifier_exception_t except) +{ + struct fast_classifier *sc = &__sc; + + spin_lock_bh(&sc->lock); + sc->exceptions[except]++; + spin_unlock_bh(&sc->lock); +} + +/* + * fast_classifier_recv() + * Handle packet receives. + * + * Returns 1 if the packet is forwarded or 0 if it isn't. + */ +int fast_classifier_recv(struct sk_buff *skb) +{ + struct net_device *dev; + struct net_device *master_dev = NULL; + int ret = 0; + + /* + * We know that for the vast majority of packets we need the transport + * layer header so we may as well start to fetch it now! + */ + prefetch(skb->data + 32); + barrier(); + + dev = skb->dev; + + /* + * Process packet like it arrived on the bridge device + */ + if (skip_to_bridge_ingress && + (dev->priv_flags & IFF_BRIDGE_PORT)) { + master_dev = sfe_dev_get_master(dev); + if (!master_dev) { + DEBUG_WARN("master dev is NULL %s\n", dev->name); + goto rx_exit; + } + dev = master_dev; + } + + /* + * We're only interested in IPv4 and IPv6 packets. 
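Aside: fast_classifier_recv() is never called directly by this module; fast_classifier_init() below publishes it through the RCU pointer athrs_fast_nat_recv. A minimal sketch of the consumer side that the companion SFE kernel patch is assumed to provide follows; the placement and helper name are assumptions, not part of this diff.

#include <linux/export.h>
#include <linux/rcupdate.h>
#include <linux/skbuff.h>

/* Assumed to live in the receive-path patch, e.g. net/core/dev.c */
int (*athrs_fast_nat_recv)(struct sk_buff *skb) __rcu __read_mostly;
EXPORT_SYMBOL_GPL(athrs_fast_nat_recv);

static int sfe_try_fast_recv(struct sk_buff *skb)
{
	int (*fast_recv)(struct sk_buff *skb);
	int consumed = 0;

	rcu_read_lock();
	fast_recv = rcu_dereference(athrs_fast_nat_recv);
	if (fast_recv)
		consumed = fast_recv(skb);	/* 1: forwarded by SFE, skip the normal stack */
	rcu_read_unlock();

	return consumed;
}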
+ */ + if (likely(htons(ETH_P_IP) == skb->protocol)) { + struct in_device *in_dev; + + /* + * Does our input device support IP processing? + */ + in_dev = (struct in_device *)dev->ip_ptr; + if (unlikely(!in_dev)) { + DEBUG_TRACE("no IP processing for device: %s\n", dev->name); + goto rx_exit; + } + + /* + * Does it have an IP address? If it doesn't then we can't do anything + * interesting here! + */ + if (unlikely(!in_dev->ifa_list)) { + DEBUG_TRACE("no IP address for device: %s\n", dev->name); + goto rx_exit; + } + + ret = sfe_ipv4_recv(dev, skb); + + } else if (likely(htons(ETH_P_IPV6) == skb->protocol)) { + struct inet6_dev *in_dev; + + /* + * Does our input device support IPv6 processing? + */ + in_dev = (struct inet6_dev *)dev->ip6_ptr; + if (unlikely(!in_dev)) { + DEBUG_TRACE("no IPv6 processing for device: %s\n", dev->name); + goto rx_exit; + } + + /* + * Does it have an IPv6 address? If it doesn't then we can't do anything + * interesting here! + */ + if (unlikely(list_empty(&in_dev->addr_list))) { + DEBUG_TRACE("no IPv6 address for device: %s\n", dev->name); + goto rx_exit; + } + + ret = sfe_ipv6_recv(dev, skb); + + } else { + DEBUG_TRACE("not IP packet\n"); + } + +rx_exit: + if (master_dev) { + dev_put(master_dev); + } + + return ret; +} + +/* + * fast_classifier_find_dev_and_mac_addr() + * Find the device and MAC address for a given IPv4 address. + * + * Returns true if we find the device and MAC address, otherwise false. + * + * We look up the rtable entry for the address and, from its neighbour + * structure, obtain the hardware address. This means this function also + * works if the neighbours are routers too. + */ +static bool fast_classifier_find_dev_and_mac_addr(sfe_ip_addr_t *addr, struct net_device **dev, u8 *mac_addr, bool is_v4) +{ + struct neighbour *neigh; + struct rtable *rt; + struct rt6_info *rt6; + struct dst_entry *dst; + struct net_device *mac_dev; + + /* + * Look up the rtable entry for the IP address then get the hardware + * address from its neighbour structure. This means this works when the + * neighbours are routers too. 
+ */ + if (likely(is_v4)) { + rt = ip_route_output(&init_net, addr->ip, 0, 0, 0); + if (unlikely(IS_ERR(rt))) { + goto ret_fail; + } + + dst = (struct dst_entry *)rt; + } else { + rt6 = rt6_lookup(&init_net, (struct in6_addr *)addr->ip6, 0, 0, 0); + if (!rt6) { + goto ret_fail; + } + + dst = (struct dst_entry *)rt6; + } + + rcu_read_lock(); + neigh = sfe_dst_get_neighbour(dst, addr); + if (unlikely(!neigh)) { + rcu_read_unlock(); + dst_release(dst); + goto ret_fail; + } + + if (unlikely(!(neigh->nud_state & NUD_VALID))) { + rcu_read_unlock(); + neigh_release(neigh); + dst_release(dst); + goto ret_fail; + } + + mac_dev = neigh->dev; + if (!mac_dev) { + rcu_read_unlock(); + neigh_release(neigh); + dst_release(dst); + goto ret_fail; + } + + memcpy(mac_addr, neigh->ha, (size_t)mac_dev->addr_len); + + dev_hold(mac_dev); + *dev = mac_dev; + rcu_read_unlock(); + neigh_release(neigh); + dst_release(dst); + + return true; + +ret_fail: + if (is_v4) { + DEBUG_TRACE("failed to find MAC address for IP: %pI4\n", addr); + + } else { + DEBUG_TRACE("failed to find MAC address for IP: %pI6\n", addr); + } + + return false; +} + +static DEFINE_SPINLOCK(sfe_connections_lock); + +struct sfe_connection { + struct hlist_node hl; + struct sfe_connection_create *sic; + struct nf_conn *ct; + int hits; + int offload_permit; + int offloaded; + bool is_v4; + unsigned char smac[ETH_ALEN]; + unsigned char dmac[ETH_ALEN]; +}; + +static int sfe_connections_size; + +#define FC_CONN_HASH_ORDER 13 +static DEFINE_HASHTABLE(fc_conn_ht, FC_CONN_HASH_ORDER); + +static u32 fc_conn_hash(sfe_ip_addr_t *saddr, sfe_ip_addr_t *daddr, + unsigned short sport, unsigned short dport, bool is_v4) +{ + u32 idx, cnt = ((is_v4 ? sizeof(saddr->ip) : sizeof(saddr->ip6))/sizeof(u32)); + u32 hash = 0; + + for (idx = 0; idx < cnt; idx++) { + hash ^= ((u32 *)saddr)[idx] ^ ((u32 *)daddr)[idx]; + } + + return hash ^ (sport | (dport << 16)); +} + +/* + * fast_classifier_update_protocol() + * Update sfe_ipv4_create struct with new protocol information before we offload + */ +static int fast_classifier_update_protocol(struct sfe_connection_create *p_sic, struct nf_conn *ct) +{ + switch (p_sic->protocol) { + case IPPROTO_TCP: + p_sic->src_td_window_scale = ct->proto.tcp.seen[0].td_scale; + p_sic->src_td_max_window = ct->proto.tcp.seen[0].td_maxwin; + p_sic->src_td_end = ct->proto.tcp.seen[0].td_end; + p_sic->src_td_max_end = ct->proto.tcp.seen[0].td_maxend; + p_sic->dest_td_window_scale = ct->proto.tcp.seen[1].td_scale; + p_sic->dest_td_max_window = ct->proto.tcp.seen[1].td_maxwin; + p_sic->dest_td_end = ct->proto.tcp.seen[1].td_end; + p_sic->dest_td_max_end = ct->proto.tcp.seen[1].td_maxend; + + if (nf_ct_tcp_no_window_check + || (ct->proto.tcp.seen[0].flags & IP_CT_TCP_FLAG_BE_LIBERAL) + || (ct->proto.tcp.seen[1].flags & IP_CT_TCP_FLAG_BE_LIBERAL)) { + p_sic->flags |= SFE_CREATE_FLAG_NO_SEQ_CHECK; + } + + /* + * If the connection is shutting down do not manage it. + * state can not be SYN_SENT, SYN_RECV because connection is assured + * Not managed states: FIN_WAIT, CLOSE_WAIT, LAST_ACK, TIME_WAIT, CLOSE. 
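Aside: fc_conn_hash() above builds the lookup key by XOR-folding the source and destination addresses one 32-bit word at a time (one word for IPv4, four for IPv6) and mixing in both ports; hash_add()/hash_for_each_possible() then reduce that key onto the 2^13-bucket fc_conn_ht table. A standalone userspace re-implementation of the IPv4 case, with purely illustrative values:

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

/* Same folding as fc_conn_hash() for IPv4; addresses and ports are used
 * in network byte order, exactly as conntrack hands them to the module. */
static uint32_t fc_conn_hash_v4(uint32_t saddr, uint32_t daddr,
				uint16_t sport, uint16_t dport)
{
	uint32_t hash = saddr ^ daddr;

	return hash ^ (sport | ((uint32_t)dport << 16));
}

int main(void)
{
	uint32_t saddr = inet_addr("192.168.1.10");	/* example flow only */
	uint32_t daddr = inet_addr("8.8.8.8");
	uint16_t sport = htons(40000);
	uint16_t dport = htons(53);

	printf("key = 0x%08x\n", fc_conn_hash_v4(saddr, daddr, sport, dport));
	return 0;
}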
+ */ + spin_lock(&ct->lock); + if (ct->proto.tcp.state != TCP_CONNTRACK_ESTABLISHED) { + spin_unlock(&ct->lock); + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_TCP_NOT_ESTABLISHED); + DEBUG_TRACE("connection in termination state: %#x, s: %pI4:%u, d: %pI4:%u\n", + ct->proto.tcp.state, &p_sic->src_ip, ntohs(p_sic->src_port), + &p_sic->dest_ip, ntohs(p_sic->dest_port)); + return 0; + } + spin_unlock(&ct->lock); + break; + + case IPPROTO_UDP: + break; + + default: + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_UNKNOW_PROTOCOL); + DEBUG_TRACE("unhandled protocol %d\n", p_sic->protocol); + return 0; + } + + return 1; +} + +/* fast_classifier_send_genl_msg() + * Function to send a generic netlink message + */ +static void fast_classifier_send_genl_msg(int msg, struct fast_classifier_tuple *fc_msg) +{ + struct sk_buff *skb; + int rc; + int buf_len; + int total_len; + void *msg_head; + + /* + * Calculate our packet payload size. + * Start with our family header. + */ + buf_len = fast_classifier_gnl_family.hdrsize; + + /* + * Add the nla_total_size of each attribute we're going to nla_put(). + */ + buf_len += nla_total_size(sizeof(*fc_msg)); + + /* + * Lastly we need to add space for the NL message header since + * genlmsg_new only accounts for the GENL header and not the + * outer NL header. To do this, we use a NL helper function which + * calculates the total size of a netlink message given a payload size. + * Note this value does not include the GENL header, but that's + * added automatically by genlmsg_new. + */ + total_len = nlmsg_total_size(buf_len); + skb = genlmsg_new(total_len, GFP_ATOMIC); + if (!skb) + return; + + msg_head = genlmsg_put(skb, 0, 0, &fast_classifier_gnl_family, 0, msg); + if (!msg_head) { + nlmsg_free(skb); + return; + } + + rc = nla_put(skb, FAST_CLASSIFIER_A_TUPLE, sizeof(struct fast_classifier_tuple), fc_msg); + if (rc != 0) { + genlmsg_cancel(skb, msg_head); + nlmsg_free(skb); + return; + } + +#if (LINUX_VERSION_CODE <= KERNEL_VERSION(3, 19 , 0)) + rc = genlmsg_end(skb, msg_head); + if (rc < 0) { + genlmsg_cancel(skb, msg_head); + nlmsg_free(skb); + return; + } +#else + genlmsg_end(skb, msg_head); + +#endif + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3, 13, 0)) + rc = genlmsg_multicast(&fast_classifier_gnl_family, skb, 0, 0, GFP_ATOMIC); +#else + rc = genlmsg_multicast(skb, 0, fast_classifier_genl_mcgrp[0].id, GFP_ATOMIC); +#endif + switch (msg) { + case FAST_CLASSIFIER_C_OFFLOADED: + if (rc == 0) { + atomic_inc(&offloaded_msgs); + } else { + atomic_inc(&offloaded_fail_msgs); + } + break; + case FAST_CLASSIFIER_C_DONE: + if (rc == 0) { + atomic_inc(&done_msgs); + } else { + atomic_inc(&done_fail_msgs); + } + break; + default: + DEBUG_ERROR("fast-classifer: Unknown message type sent!\n"); + break; + } + + DEBUG_TRACE("Notify NL message %d ", msg); + if (fc_msg->ethertype == AF_INET) { + DEBUG_TRACE("sip=%pI4 dip=%pI4 ", &fc_msg->src_saddr, &fc_msg->dst_saddr); + } else { + DEBUG_TRACE("sip=%pI6 dip=%pI6 ", &fc_msg->src_saddr, &fc_msg->dst_saddr); + } + DEBUG_TRACE("protocol=%d sport=%d dport=%d smac=%pM dmac=%pM\n", + fc_msg->proto, fc_msg->sport, fc_msg->dport, fc_msg->smac, fc_msg->dmac); +} + +/* + * fast_classifier_find_conn() + * find a connection object in the hash table + * @pre the sfe_connection_lock must be held before calling this function + */ +static struct sfe_connection * +fast_classifier_find_conn(sfe_ip_addr_t *saddr, sfe_ip_addr_t *daddr, + unsigned short sport, unsigned short dport, + unsigned char proto, bool is_v4) +{ + struct 
sfe_connection_create *p_sic; + struct sfe_connection *conn; + u32 key; +#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0)) + struct hlist_node *node; +#endif + + key = fc_conn_hash(saddr, daddr, sport, dport, is_v4); + + sfe_hash_for_each_possible(fc_conn_ht, conn, node, hl, key) { + if (conn->is_v4 != is_v4) { + continue; + } + + p_sic = conn->sic; + + if (p_sic->protocol == proto && + p_sic->src_port == sport && + p_sic->dest_port == dport && + sfe_addr_equal(&p_sic->src_ip, saddr, is_v4) && + sfe_addr_equal(&p_sic->dest_ip, daddr, is_v4)) { + return conn; + } + } + + DEBUG_TRACE("connection not found\n"); + return NULL; +} + +/* + * fast_classifier_sb_find_conn() + * find a connection object in the hash table according to information of packet + * if not found, reverse the tuple and try again. + * @pre the sfe_connection_lock must be held before calling this function + */ +static struct sfe_connection * +fast_classifier_sb_find_conn(sfe_ip_addr_t *saddr, sfe_ip_addr_t *daddr, + unsigned short sport, unsigned short dport, + unsigned char proto, bool is_v4) +{ + struct sfe_connection_create *p_sic; + struct sfe_connection *conn; + u32 key; +#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0)) + struct hlist_node *node; +#endif + + key = fc_conn_hash(saddr, daddr, sport, dport, is_v4); + + sfe_hash_for_each_possible(fc_conn_ht, conn, node, hl, key) { + if (conn->is_v4 != is_v4) { + continue; + } + + p_sic = conn->sic; + + if (p_sic->protocol == proto && + p_sic->src_port == sport && + p_sic->dest_port_xlate == dport && + sfe_addr_equal(&p_sic->src_ip, saddr, is_v4) && + sfe_addr_equal(&p_sic->dest_ip_xlate, daddr, is_v4)) { + return conn; + } + } + + /* + * Reverse the tuple and try again + */ + key = fc_conn_hash(daddr, saddr, dport, sport, is_v4); + + sfe_hash_for_each_possible(fc_conn_ht, conn, node, hl, key) { + if (conn->is_v4 != is_v4) { + continue; + } + + p_sic = conn->sic; + + if (p_sic->protocol == proto && + p_sic->src_port == dport && + p_sic->dest_port_xlate == sport && + sfe_addr_equal(&p_sic->src_ip, daddr, is_v4) && + sfe_addr_equal(&p_sic->dest_ip_xlate, saddr, is_v4)) { + return conn; + } + } + + DEBUG_TRACE("connection not found\n"); + return NULL; +} + +/* + * fast_classifier_add_conn() + * add a connection object in the hash table if no duplicate + * @conn connection to add + * @return conn if successful, NULL if duplicate + */ +static struct sfe_connection * +fast_classifier_add_conn(struct sfe_connection *conn) +{ + struct sfe_connection_create *sic = conn->sic; + u32 key; + + spin_lock_bh(&sfe_connections_lock); + if (fast_classifier_find_conn(&sic->src_ip, &sic->dest_ip, sic->src_port, + sic->dest_port, sic->protocol, conn->is_v4)) { + spin_unlock_bh(&sfe_connections_lock); + return NULL; + } + + key = fc_conn_hash(&sic->src_ip, &sic->dest_ip, + sic->src_port, sic->dest_port, conn->is_v4); + + hash_add(fc_conn_ht, &conn->hl, key); + sfe_connections_size++; + spin_unlock_bh(&sfe_connections_lock); + + DEBUG_TRACE(" -> adding item to sfe_connections, new size: %d\n", sfe_connections_size); + + if (conn->is_v4) { + DEBUG_TRACE("new offloadable: key: %u proto: %d src_ip: %pI4 dst_ip: %pI4, src_port: %d, dst_port: %d\n", + key, sic->protocol, &(sic->src_ip), &(sic->dest_ip), sic->src_port, sic->dest_port); + } else { + DEBUG_TRACE("new offloadable: key: %u proto: %d src_ip: %pI6 dst_ip: %pI6, src_port: %d, dst_port: %d\n", + key, sic->protocol, &(sic->src_ip), &(sic->dest_ip), sic->src_port, sic->dest_port); + } + + return conn; +} + +/* + * 
fast_classifier_offload_genl_msg() + * Called from user space to offload a connection + */ +static int +fast_classifier_offload_genl_msg(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *na; + struct fast_classifier_tuple *fc_msg; + struct sfe_connection *conn; + + na = info->attrs[FAST_CLASSIFIER_A_TUPLE]; + fc_msg = nla_data(na); + + if (fc_msg->ethertype == AF_INET) { + DEBUG_TRACE("want to offload: %d-%d, %pI4, %pI4, %d, %d SMAC=%pM DMAC=%pM\n", + fc_msg->ethertype, + fc_msg->proto, + &fc_msg->src_saddr, + &fc_msg->dst_saddr, + fc_msg->sport, + fc_msg->dport, + fc_msg->smac, + fc_msg->dmac); + } else { + DEBUG_TRACE("want to offload: %d-%d, %pI6, %pI6, %d, %d SMAC=%pM DMAC=%pM\n", + fc_msg->ethertype, + fc_msg->proto, + &fc_msg->src_saddr, + &fc_msg->dst_saddr, + fc_msg->sport, + fc_msg->dport, + fc_msg->smac, + fc_msg->dmac); + } + + spin_lock_bh(&sfe_connections_lock); + conn = fast_classifier_sb_find_conn((sfe_ip_addr_t *)&fc_msg->src_saddr, + (sfe_ip_addr_t *)&fc_msg->dst_saddr, + fc_msg->sport, + fc_msg->dport, + fc_msg->proto, + (fc_msg->ethertype == AF_INET)); + if (!conn) { + spin_unlock_bh(&sfe_connections_lock); + DEBUG_TRACE("REQUEST OFFLOAD NO MATCH\n"); + atomic_inc(&offload_no_match_msgs); + return 0; + } + + conn->offload_permit = 1; + spin_unlock_bh(&sfe_connections_lock); + atomic_inc(&offload_msgs); + + DEBUG_TRACE("INFO: calling sfe rule creation!\n"); + return 0; +} + +/* + * fast_classifier_nl_genl_msg_DUMP() + * ignore fast_classifier_messages OFFLOADED and DONE + */ +static int fast_classifier_nl_genl_msg_DUMP(struct sk_buff *skb, + struct netlink_callback *cb) +{ + return 0; +} + +/* auto offload connection once we have this many packets*/ +static int offload_at_pkts = 128; + +/* + * fast_classifier_post_routing() + * Called for packets about to leave the box - either locally generated or forwarded from another interface + */ +static unsigned int fast_classifier_post_routing(struct sk_buff *skb, bool is_v4) +{ + int ret; + struct sfe_connection_create sic; + struct sfe_connection_create *p_sic; + struct net_device *in; + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + struct net_device *dev; + struct net_device *src_dev; + struct net_device *dest_dev; + struct net_device *src_dev_tmp; + struct net_device *dest_dev_tmp; + struct net_device *src_br_dev = NULL; + struct net_device *dest_br_dev = NULL; + struct nf_conntrack_tuple orig_tuple; + struct nf_conntrack_tuple reply_tuple; + struct sfe_connection *conn; + + /* + * Don't process broadcast or multicast packets. + */ + if (unlikely(skb->pkt_type == PACKET_BROADCAST)) { + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_PACKET_BROADCAST); + DEBUG_TRACE("broadcast, ignoring\n"); + return NF_ACCEPT; + } + if (unlikely(skb->pkt_type == PACKET_MULTICAST)) { + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_PACKET_MULTICAST); + DEBUG_TRACE("multicast, ignoring\n"); + return NF_ACCEPT; + } + + /* + * Don't process packets that are not being forwarded. + */ + in = dev_get_by_index(&init_net, skb->skb_iif); + if (!in) { + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_NO_IIF); + DEBUG_TRACE("packet not forwarding\n"); + return NF_ACCEPT; + } + + dev_put(in); + + /* + * Don't process packets that aren't being tracked by conntrack. + */ + ct = nf_ct_get(skb, &ctinfo); + if (unlikely(!ct)) { + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_NO_CT); + DEBUG_TRACE("no conntrack connection, ignoring\n"); + return NF_ACCEPT; + } + + /* + * Don't process untracked connections. 
+ */ + if (unlikely(nf_ct_is_untracked(ct))) { + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_CT_NO_TRACK); + DEBUG_TRACE("untracked connection\n"); + return NF_ACCEPT; + } + + /* + * Unconfirmed connection may be dropped by Linux at the final step, + * So we don't process unconfirmed connections. + */ + if (!nf_ct_is_confirmed(ct)) { + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_CT_NO_CONFIRM); + DEBUG_TRACE("unconfirmed connection\n"); + return NF_ACCEPT; + } + + /* + * Don't process connections that require support from a 'helper' (typically a NAT ALG). + */ + if (unlikely(nfct_help(ct))) { + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_CT_IS_ALG); + DEBUG_TRACE("connection has helper\n"); + return NF_ACCEPT; + } + + memset(&sic, 0, sizeof(sic)); + + /* + * Look up the details of our connection in conntrack. + * + * Note that the data we get from conntrack is for the "ORIGINAL" direction + * but our packet may actually be in the "REPLY" direction. + */ + orig_tuple = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; + reply_tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple; + sic.protocol = (s32)orig_tuple.dst.protonum; + + sic.flags = 0; + + /* + * Get addressing information, non-NAT first + */ + if (likely(is_v4)) { + u32 dscp; + + sic.src_ip.ip = (__be32)orig_tuple.src.u3.ip; + sic.dest_ip.ip = (__be32)orig_tuple.dst.u3.ip; + + if (ipv4_is_multicast(sic.src_ip.ip) || ipv4_is_multicast(sic.dest_ip.ip)) { + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_IS_IPV4_MCAST); + DEBUG_TRACE("multicast address\n"); + return NF_ACCEPT; + } + + /* + * NAT'ed addresses - note these are as seen from the 'reply' direction + * When NAT does not apply to this connection these will be identical to the above. + */ + sic.src_ip_xlate.ip = (__be32)reply_tuple.dst.u3.ip; + sic.dest_ip_xlate.ip = (__be32)reply_tuple.src.u3.ip; + + dscp = ipv4_get_dsfield(ip_hdr(skb)) >> XT_DSCP_SHIFT; + if (dscp) { + sic.dest_dscp = dscp; + sic.src_dscp = sic.dest_dscp; + sic.flags |= SFE_CREATE_FLAG_REMARK_DSCP; + } + } else { + u32 dscp; + + sic.src_ip.ip6[0] = *((struct sfe_ipv6_addr *)&orig_tuple.src.u3.in6); + sic.dest_ip.ip6[0] = *((struct sfe_ipv6_addr *)&orig_tuple.dst.u3.in6); + + if (ipv6_addr_is_multicast((struct in6_addr *)sic.src_ip.ip6) || + ipv6_addr_is_multicast((struct in6_addr *)sic.dest_ip.ip6)) { + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_IS_IPV6_MCAST); + DEBUG_TRACE("multicast address\n"); + return NF_ACCEPT; + } + + /* + * NAT'ed addresses - note these are as seen from the 'reply' direction + * When NAT does not apply to this connection these will be identical to the above. + */ + sic.src_ip_xlate.ip6[0] = *((struct sfe_ipv6_addr *)&reply_tuple.dst.u3.in6); + sic.dest_ip_xlate.ip6[0] = *((struct sfe_ipv6_addr *)&reply_tuple.src.u3.in6); + + dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> XT_DSCP_SHIFT; + if (dscp) { + sic.dest_dscp = dscp; + sic.src_dscp = sic.dest_dscp; + sic.flags |= SFE_CREATE_FLAG_REMARK_DSCP; + } + } + + switch (sic.protocol) { + case IPPROTO_TCP: + sic.src_port = orig_tuple.src.u.tcp.port; + sic.dest_port = orig_tuple.dst.u.tcp.port; + sic.src_port_xlate = reply_tuple.dst.u.tcp.port; + sic.dest_port_xlate = reply_tuple.src.u.tcp.port; + + /* + * Don't try to manage a non-established connection. 
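Aside: ipv4_get_dsfield()/ipv6_get_dsfield() above return the whole TOS/traffic-class octet, and shifting right by XT_DSCP_SHIFT (which is 2) discards the two ECN bits, leaving the 6-bit DSCP that gets copied into the rule. A tiny worked example:

#include <stdint.h>
#include <stdio.h>

#define XT_DSCP_SHIFT 2			/* same value the kernel's xt_dscp.h uses */

int main(void)
{
	uint8_t dsfield = 0xb8;				/* TOS octet: DSCP EF, ECN not set */
	uint8_t dscp = dsfield >> XT_DSCP_SHIFT;	/* drop the two ECN bits */

	printf("dscp = %u (0x%02x)\n", dscp, dscp);	/* prints 46 (0x2e), i.e. EF */
	return 0;
}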
+ */ + if (!test_bit(IPS_ASSURED_BIT, &ct->status)) { + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_TCP_NOT_ASSURED); + DEBUG_TRACE("non-established connection\n"); + return NF_ACCEPT; + } + + break; + + case IPPROTO_UDP: + sic.src_port = orig_tuple.src.u.udp.port; + sic.dest_port = orig_tuple.dst.u.udp.port; + sic.src_port_xlate = reply_tuple.dst.u.udp.port; + sic.dest_port_xlate = reply_tuple.src.u.udp.port; + break; + + default: + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_UNKNOW_PROTOCOL); + DEBUG_TRACE("unhandled protocol %d\n", sic.protocol); + return NF_ACCEPT; + } + +#ifdef CONFIG_XFRM + sic.original_accel = 1; + sic.reply_accel = 1; +#endif + + /* + * Get QoS information + */ + if (skb->priority) { + sic.dest_priority = skb->priority; + sic.src_priority = sic.dest_priority; + sic.flags |= SFE_CREATE_FLAG_REMARK_PRIORITY; + } + + if (is_v4) { + DEBUG_TRACE("POST_ROUTE: checking new connection: %d src_ip: %pI4 dst_ip: %pI4, src_port: %d, dst_port: %d\n", + sic.protocol, &sic.src_ip, &sic.dest_ip, sic.src_port, sic.dest_port); + } else { + DEBUG_TRACE("POST_ROUTE: checking new connection: %d src_ip: %pI6 dst_ip: %pI6, src_port: %d, dst_port: %d\n", + sic.protocol, &sic.src_ip, &sic.dest_ip, sic.src_port, sic.dest_port); + } + + /* + * If we already have this connection in our list, skip it + * XXX: this may need to be optimized + */ + spin_lock_bh(&sfe_connections_lock); + + conn = fast_classifier_find_conn(&sic.src_ip, &sic.dest_ip, sic.src_port, sic.dest_port, sic.protocol, is_v4); + if (conn) { + conn->hits++; + + if (!conn->offloaded) { + if (conn->offload_permit || conn->hits >= offload_at_pkts) { + DEBUG_TRACE("OFFLOADING CONNECTION, TOO MANY HITS\n"); + + if (fast_classifier_update_protocol(conn->sic, conn->ct) == 0) { + spin_unlock_bh(&sfe_connections_lock); + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_UPDATE_PROTOCOL_FAIL); + DEBUG_TRACE("UNKNOWN PROTOCOL OR CONNECTION CLOSING, SKIPPING\n"); + return NF_ACCEPT; + } + + DEBUG_TRACE("INFO: calling sfe rule creation!\n"); + spin_unlock_bh(&sfe_connections_lock); + + ret = is_v4 ? sfe_ipv4_create_rule(conn->sic) : sfe_ipv6_create_rule(conn->sic); + if ((ret == 0) || (ret == -EADDRINUSE)) { + struct fast_classifier_tuple fc_msg; + + if (is_v4) { + fc_msg.ethertype = AF_INET; + fc_msg.src_saddr.in = *((struct in_addr *)&sic.src_ip); + fc_msg.dst_saddr.in = *((struct in_addr *)&sic.dest_ip_xlate); + } else { + fc_msg.ethertype = AF_INET6; + fc_msg.src_saddr.in6 = *((struct in6_addr *)&sic.src_ip); + fc_msg.dst_saddr.in6 = *((struct in6_addr *)&sic.dest_ip_xlate); + } + + fc_msg.proto = sic.protocol; + fc_msg.sport = sic.src_port; + fc_msg.dport = sic.dest_port_xlate; + memcpy(fc_msg.smac, conn->smac, ETH_ALEN); + memcpy(fc_msg.dmac, conn->dmac, ETH_ALEN); + fast_classifier_send_genl_msg(FAST_CLASSIFIER_C_OFFLOADED, &fc_msg); + conn->offloaded = 1; + } + + return NF_ACCEPT; + } + } + + spin_unlock_bh(&sfe_connections_lock); + if (conn->offloaded) { + is_v4 ? sfe_ipv4_update_rule(conn->sic) : sfe_ipv6_update_rule(conn->sic); + } + + DEBUG_TRACE("FOUND, SKIPPING\n"); + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_WAIT_FOR_ACCELERATION); + return NF_ACCEPT; + } + + spin_unlock_bh(&sfe_connections_lock); + + /* + * Get the net device and MAC addresses that correspond to the various source and + * destination host addresses. 
+ */ + if (!fast_classifier_find_dev_and_mac_addr(&sic.src_ip, &src_dev_tmp, sic.src_mac, is_v4)) { + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_NO_SRC_DEV); + return NF_ACCEPT; + } + src_dev = src_dev_tmp; + + if (!fast_classifier_find_dev_and_mac_addr(&sic.src_ip_xlate, &dev, sic.src_mac_xlate, is_v4)) { + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_NO_SRC_XLATE_DEV); + goto done1; + } + dev_put(dev); + + if (!fast_classifier_find_dev_and_mac_addr(&sic.dest_ip, &dev, sic.dest_mac, is_v4)) { + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_NO_DEST_DEV); + goto done1; + } + dev_put(dev); + + if (!fast_classifier_find_dev_and_mac_addr(&sic.dest_ip_xlate, &dest_dev_tmp, sic.dest_mac_xlate, is_v4)) { + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_NO_DEST_XLATE_DEV); + goto done1; + } + dest_dev = dest_dev_tmp; + + /* + * Our devices may actually be part of a bridge interface. If that's + * the case then find the bridge interface instead. + */ + if (src_dev->priv_flags & IFF_BRIDGE_PORT) { + src_br_dev = sfe_dev_get_master(src_dev); + if (!src_br_dev) { + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_NO_BRIDGE); + DEBUG_TRACE("no bridge found for: %s\n", src_dev->name); + goto done2; + } + src_dev = src_br_dev; + } + + if (dest_dev->priv_flags & IFF_BRIDGE_PORT) { + dest_br_dev = sfe_dev_get_master(dest_dev); + if (!dest_br_dev) { + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_NO_BRIDGE); + DEBUG_TRACE("no bridge found for: %s\n", dest_dev->name); + goto done3; + } + dest_dev = dest_br_dev; + } + + sic.src_dev = src_dev; + sic.dest_dev = dest_dev; + + sic.src_mtu = src_dev->mtu; + sic.dest_mtu = dest_dev->mtu; + + if (skb->mark) { + DEBUG_TRACE("SKB MARK NON ZERO %x\n", skb->mark); + } + sic.mark = skb->mark; + + conn = kmalloc(sizeof(*conn), GFP_ATOMIC); + if (!conn) { + printk(KERN_CRIT "ERROR: no memory for sfe\n"); + goto done4; + } + conn->hits = 0; + conn->offload_permit = 0; + conn->offloaded = 0; + conn->is_v4 = is_v4; + DEBUG_TRACE("Source MAC=%pM\n", sic.src_mac); + memcpy(conn->smac, sic.src_mac, ETH_ALEN); + memcpy(conn->dmac, sic.dest_mac_xlate, ETH_ALEN); + + p_sic = kmalloc(sizeof(*p_sic), GFP_ATOMIC); + if (!p_sic) { + printk(KERN_CRIT "ERROR: no memory for sfe\n"); + kfree(conn); + goto done4; + } + + memcpy(p_sic, &sic, sizeof(sic)); + conn->sic = p_sic; + conn->ct = ct; + + if (!fast_classifier_add_conn(conn)) { + kfree(conn->sic); + kfree(conn); + } + + /* + * If we had bridge ports then release them too. 
+ */ +done4: + if (dest_br_dev) { + dev_put(dest_br_dev); + } +done3: + if (src_br_dev) { + dev_put(src_br_dev); + } +done2: + dev_put(dest_dev_tmp); +done1: + dev_put(src_dev_tmp); + + return NF_ACCEPT; +} + +/* + * fast_classifier_ipv4_post_routing_hook() + * Called for packets about to leave the box - either locally generated or forwarded from another interface + */ +fast_classifier_ipv4_post_routing_hook(hooknum, ops, skb, in_unused, out, okfn) +{ + return fast_classifier_post_routing(skb, true); +} + +/* + * fast_classifier_ipv6_post_routing_hook() + * Called for packets about to leave the box - either locally generated or forwarded from another interface + */ +fast_classifier_ipv6_post_routing_hook(hooknum, ops, skb, in_unused, out, okfn) +{ + return fast_classifier_post_routing(skb, false); +} + +/* + * fast_classifier_update_mark() + * updates the mark for a fast-classifier connection + */ +static void fast_classifier_update_mark(struct sfe_connection_mark *mark, bool is_v4) +{ + struct sfe_connection *conn; + + spin_lock_bh(&sfe_connections_lock); + + conn = fast_classifier_find_conn(&mark->src_ip, &mark->dest_ip, + mark->src_port, mark->dest_port, + mark->protocol, is_v4); + if (conn) { + conn->sic->mark = mark->mark; + } + + spin_unlock_bh(&sfe_connections_lock); +} + +#ifdef CONFIG_NF_CONNTRACK_EVENTS +/* + * fast_classifier_conntrack_event() + * Callback event invoked when a conntrack connection's state changes. + */ +#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS +static int fast_classifier_conntrack_event(struct notifier_block *this, + unsigned long events, void *ptr) +#else +static int fast_classifier_conntrack_event(unsigned int events, struct nf_ct_event *item) +#endif +{ +#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS + struct nf_ct_event *item = ptr; +#endif + struct sfe_connection_destroy sid; + struct nf_conn *ct = item->ct; + struct nf_conntrack_tuple orig_tuple; + struct sfe_connection *conn; + struct fast_classifier_tuple fc_msg; + int offloaded = 0; + bool is_v4; + + /* + * If we don't have a conntrack entry then we're done. + */ + if (unlikely(!ct)) { + DEBUG_WARN("no ct in conntrack event callback\n"); + return NOTIFY_DONE; + } + + /* + * If this is an untracked connection then we can't have any state either. + */ + if (unlikely(nf_ct_is_untracked(ct))) { + DEBUG_TRACE("ignoring untracked conn\n"); + return NOTIFY_DONE; + } + + orig_tuple = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; + sid.protocol = (s32)orig_tuple.dst.protonum; + + /* + * Extract information from the conntrack connection. We're only interested + * in nominal connection information (i.e. we're ignoring any NAT information). 
+ */ + if (likely(nf_ct_l3num(ct) == AF_INET)) { + sid.src_ip.ip = (__be32)orig_tuple.src.u3.ip; + sid.dest_ip.ip = (__be32)orig_tuple.dst.u3.ip; + is_v4 = true; + } else if (likely(nf_ct_l3num(ct) == AF_INET6)) { + sid.src_ip.ip6[0] = *((struct sfe_ipv6_addr *)&orig_tuple.src.u3.in6); + sid.dest_ip.ip6[0] = *((struct sfe_ipv6_addr *)&orig_tuple.dst.u3.in6); + is_v4 = false; + } else { + DEBUG_TRACE("ignoring non-IPv4 and non-IPv6 connection\n"); + return NOTIFY_DONE; + } + + switch (sid.protocol) { + case IPPROTO_TCP: + sid.src_port = orig_tuple.src.u.tcp.port; + sid.dest_port = orig_tuple.dst.u.tcp.port; + break; + + case IPPROTO_UDP: + sid.src_port = orig_tuple.src.u.udp.port; + sid.dest_port = orig_tuple.dst.u.udp.port; + break; + + default: + DEBUG_TRACE("unhandled protocol: %d\n", sid.protocol); + return NOTIFY_DONE; + } + + /* + * Check for an updated mark + */ + if ((events & (1 << IPCT_MARK)) && (ct->mark != 0)) { + struct sfe_connection_mark mark; + + mark.protocol = sid.protocol; + mark.src_ip = sid.src_ip; + mark.dest_ip = sid.dest_ip; + mark.src_port = sid.src_port; + mark.dest_port = sid.dest_port; + mark.mark = ct->mark; + + is_v4 ? sfe_ipv4_mark_rule(&mark) : sfe_ipv6_mark_rule(&mark); + fast_classifier_update_mark(&mark, is_v4); + } + + /* + * We're only interested in destroy events at this point + */ + if (unlikely(!(events & (1 << IPCT_DESTROY)))) { + DEBUG_TRACE("ignoring non-destroy event\n"); + return NOTIFY_DONE; + } + + if (is_v4) { + DEBUG_TRACE("Try to clean up: proto: %d src_ip: %pI4 dst_ip: %pI4, src_port: %d, dst_port: %d\n", + sid.protocol, &sid.src_ip, &sid.dest_ip, sid.src_port, sid.dest_port); + } else { + DEBUG_TRACE("Try to clean up: proto: %d src_ip: %pI6 dst_ip: %pI6, src_port: %d, dst_port: %d\n", + sid.protocol, &sid.src_ip, &sid.dest_ip, sid.src_port, sid.dest_port); + } + + spin_lock_bh(&sfe_connections_lock); + + conn = fast_classifier_find_conn(&sid.src_ip, &sid.dest_ip, sid.src_port, sid.dest_port, sid.protocol, is_v4); + if (conn && conn->offloaded) { + if (is_v4) { + fc_msg.ethertype = AF_INET; + fc_msg.src_saddr.in = *((struct in_addr *)&conn->sic->src_ip); + fc_msg.dst_saddr.in = *((struct in_addr *)&conn->sic->dest_ip_xlate); + } else { + fc_msg.ethertype = AF_INET6; + fc_msg.src_saddr.in6 = *((struct in6_addr *)&conn->sic->src_ip); + fc_msg.dst_saddr.in6 = *((struct in6_addr *)&conn->sic->dest_ip_xlate); + } + + fc_msg.proto = conn->sic->protocol; + fc_msg.sport = conn->sic->src_port; + fc_msg.dport = conn->sic->dest_port_xlate; + memcpy(fc_msg.smac, conn->smac, ETH_ALEN); + memcpy(fc_msg.dmac, conn->dmac, ETH_ALEN); + offloaded = 1; + } + + if (conn) { + DEBUG_TRACE("Free connection\n"); + + hash_del(&conn->hl); + sfe_connections_size--; + kfree(conn->sic); + kfree(conn); + } else { + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_CT_DESTROY_MISS); + } + + spin_unlock_bh(&sfe_connections_lock); + + is_v4 ? 
sfe_ipv4_destroy_rule(&sid) : sfe_ipv6_destroy_rule(&sid); + + if (offloaded) { + fast_classifier_send_genl_msg(FAST_CLASSIFIER_C_DONE, &fc_msg); + } + + return NOTIFY_DONE; +} + +/* + * Netfilter conntrack event system to monitor connection tracking changes + */ +#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS +static struct notifier_block fast_classifier_conntrack_notifier = { + .notifier_call = fast_classifier_conntrack_event, +}; +#else +static struct nf_ct_event_notifier fast_classifier_conntrack_notifier = { + .fcn = fast_classifier_conntrack_event, +}; +#endif +#endif + +/* + * Structure to establish a hook into the post routing netfilter point - this + * will pick up local outbound and packets going from one interface to another. + * + * Note: see include/linux/netfilter_ipv4.h for info related to priority levels. + * We want to examine packets after NAT translation and any ALG processing. + */ +static struct nf_hook_ops fast_classifier_ops_post_routing[] __read_mostly = { + SFE_IPV4_NF_POST_ROUTING_HOOK(__fast_classifier_ipv4_post_routing_hook), + SFE_IPV6_NF_POST_ROUTING_HOOK(__fast_classifier_ipv6_post_routing_hook), +}; + +/* + * fast_classifier_sync_rule() + * Synchronize a connection's state. + */ +static void fast_classifier_sync_rule(struct sfe_connection_sync *sis) +{ + struct nf_conntrack_tuple_hash *h; + struct nf_conntrack_tuple tuple; + struct nf_conn *ct; + SFE_NF_CONN_ACCT(acct); + + /* + * Create a tuple so as to be able to look up a connection + */ + memset(&tuple, 0, sizeof(tuple)); + tuple.src.u.all = (__be16)sis->src_port; + tuple.dst.dir = IP_CT_DIR_ORIGINAL; + tuple.dst.protonum = (u8)sis->protocol; + tuple.dst.u.all = (__be16)sis->dest_port; + + if (sis->is_v6) { + tuple.src.u3.in6 = *((struct in6_addr *)sis->src_ip.ip6); + tuple.dst.u3.in6 = *((struct in6_addr *)sis->dest_ip.ip6); + tuple.src.l3num = AF_INET6; + + DEBUG_TRACE("update connection - p: %d, s: %pI6:%u, d: %pI6:%u\n", + (int)tuple.dst.protonum, + &tuple.src.u3.in6, (unsigned int)ntohs(tuple.src.u.all), + &tuple.dst.u3.in6, (unsigned int)ntohs(tuple.dst.u.all)); + } else { + tuple.src.u3.ip = sis->src_ip.ip; + tuple.dst.u3.ip = sis->dest_ip.ip; + tuple.src.l3num = AF_INET; + + DEBUG_TRACE("update connection - p: %d, s: %pI4:%u, d: %pI4:%u\n", + (int)tuple.dst.protonum, + &tuple.src.u3.ip, (unsigned int)ntohs(tuple.src.u.all), + &tuple.dst.u3.ip, (unsigned int)ntohs(tuple.dst.u.all)); + } + + /* + * Update packet count for ingress on bridge device + */ + if (skip_to_bridge_ingress) { + struct rtnl_link_stats64 nlstats; + nlstats.tx_packets = 0; + nlstats.tx_bytes = 0; + + if (sis->src_dev && IFF_EBRIDGE && + (sis->src_new_packet_count || sis->src_new_byte_count)) { + nlstats.rx_packets = sis->src_new_packet_count; + nlstats.rx_bytes = sis->src_new_byte_count; + spin_lock_bh(&sfe_connections_lock); + br_dev_update_stats(sis->src_dev, &nlstats); + spin_unlock_bh(&sfe_connections_lock); + } + if (sis->dest_dev && IFF_EBRIDGE && + (sis->dest_new_packet_count || sis->dest_new_byte_count)) { + nlstats.rx_packets = sis->dest_new_packet_count; + nlstats.rx_bytes = sis->dest_new_byte_count; + spin_lock_bh(&sfe_connections_lock); + br_dev_update_stats(sis->dest_dev, &nlstats); + spin_unlock_bh(&sfe_connections_lock); + } + } + + /* + * Look up conntrack connection + */ + h = nf_conntrack_find_get(&init_net, SFE_NF_CT_DEFAULT_ZONE, &tuple); + if (unlikely(!h)) { + DEBUG_TRACE("no connection found\n"); + return; + } + + ct = nf_ct_tuplehash_to_ctrack(h); + NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct); + + /* + 
* Only update if this is not a fixed timeout + */ + if (!test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) { + spin_lock_bh(&ct->lock); + ct->timeout.expires += sis->delta_jiffies; + spin_unlock_bh(&ct->lock); + } + + acct = nf_conn_acct_find(ct); + if (acct) { + spin_lock_bh(&ct->lock); + atomic64_add(sis->src_new_packet_count, &SFE_ACCT_COUNTER(acct)[IP_CT_DIR_ORIGINAL].packets); + atomic64_add(sis->src_new_byte_count, &SFE_ACCT_COUNTER(acct)[IP_CT_DIR_ORIGINAL].bytes); + atomic64_add(sis->dest_new_packet_count, &SFE_ACCT_COUNTER(acct)[IP_CT_DIR_REPLY].packets); + atomic64_add(sis->dest_new_byte_count, &SFE_ACCT_COUNTER(acct)[IP_CT_DIR_REPLY].bytes); + spin_unlock_bh(&ct->lock); + } + + switch (sis->protocol) { + case IPPROTO_TCP: + spin_lock_bh(&ct->lock); + if (ct->proto.tcp.seen[0].td_maxwin < sis->src_td_max_window) { + ct->proto.tcp.seen[0].td_maxwin = sis->src_td_max_window; + } + if ((s32)(ct->proto.tcp.seen[0].td_end - sis->src_td_end) < 0) { + ct->proto.tcp.seen[0].td_end = sis->src_td_end; + } + if ((s32)(ct->proto.tcp.seen[0].td_maxend - sis->src_td_max_end) < 0) { + ct->proto.tcp.seen[0].td_maxend = sis->src_td_max_end; + } + if (ct->proto.tcp.seen[1].td_maxwin < sis->dest_td_max_window) { + ct->proto.tcp.seen[1].td_maxwin = sis->dest_td_max_window; + } + if ((s32)(ct->proto.tcp.seen[1].td_end - sis->dest_td_end) < 0) { + ct->proto.tcp.seen[1].td_end = sis->dest_td_end; + } + if ((s32)(ct->proto.tcp.seen[1].td_maxend - sis->dest_td_max_end) < 0) { + ct->proto.tcp.seen[1].td_maxend = sis->dest_td_max_end; + } + spin_unlock_bh(&ct->lock); + break; + } + + /* + * Release connection + */ + nf_ct_put(ct); +} + +/* + * fast_classifier_device_event() + */ +static int fast_classifier_device_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct net_device *dev = SFE_DEV_EVENT_PTR(ptr); + + if (dev && (event == NETDEV_DOWN)) { + sfe_ipv4_destroy_all_rules_for_dev(dev); + sfe_ipv6_destroy_all_rules_for_dev(dev); + } + + return NOTIFY_DONE; +} + +/* + * fast_classifier_inet_event() + */ +static int fast_classifier_inet_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct net_device *dev = ((struct in_ifaddr *)ptr)->ifa_dev->dev; + + if (dev && (event == NETDEV_DOWN)) { + sfe_ipv4_destroy_all_rules_for_dev(dev); + } + + return NOTIFY_DONE; +} + +/* + * fast_classifier_inet6_event() + */ +static int fast_classifier_inet6_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct net_device *dev = ((struct inet6_ifaddr *)ptr)->idev->dev; + + if (dev && (event == NETDEV_DOWN)) { + sfe_ipv6_destroy_all_rules_for_dev(dev); + } + + return NOTIFY_DONE; +} + +/* + * fast_classifier_get_offload_at_pkts() + */ +static ssize_t fast_classifier_get_offload_at_pkts(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + return snprintf(buf, (ssize_t)PAGE_SIZE, "%d\n", offload_at_pkts); +} + +/* + * fast_classifier_set_offload_at_pkts() + */ +static ssize_t fast_classifier_set_offload_at_pkts(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t size) +{ + long new; + int ret; + + ret = kstrtol(buf, 0, &new); + if (ret == -EINVAL || ((int)new != new)) + return -EINVAL; + + offload_at_pkts = new; + + return size; +} + +/* + * fast_classifier_get_debug_info() + */ +static ssize_t fast_classifier_get_debug_info(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + size_t len = 0; + struct sfe_connection *conn; + u32 i; +#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0)) + struct 
hlist_node *node; +#endif + + spin_lock_bh(&sfe_connections_lock); + len += scnprintf(buf, PAGE_SIZE - len, "size=%d offload=%d offload_no_match=%d" + " offloaded=%d done=%d offloaded_fail=%d done_fail=%d\n", + sfe_connections_size, + atomic_read(&offload_msgs), + atomic_read(&offload_no_match_msgs), + atomic_read(&offloaded_msgs), + atomic_read(&done_msgs), + atomic_read(&offloaded_fail_msgs), + atomic_read(&done_fail_msgs)); + sfe_hash_for_each(fc_conn_ht, i, node, conn, hl) { + len += scnprintf(buf + len, PAGE_SIZE - len, + (conn->is_v4 ? "o=%d, p=%d [%pM]:%pI4:%u %pI4:%u:[%pM] m=%08x h=%d\n" : "o=%d, p=%d [%pM]:%pI6:%u %pI6:%u:[%pM] m=%08x h=%d\n"), + conn->offloaded, + conn->sic->protocol, + conn->sic->src_mac, + &conn->sic->src_ip, + conn->sic->src_port, + &conn->sic->dest_ip, + conn->sic->dest_port, + conn->sic->dest_mac_xlate, + conn->sic->mark, + conn->hits); + } + spin_unlock_bh(&sfe_connections_lock); + + return len; +} + +/* + * fast_classifier_get_skip_bridge_ingress() + */ +static ssize_t fast_classifier_get_skip_bridge_ingress(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + return snprintf(buf, (ssize_t)PAGE_SIZE, "%d\n", skip_to_bridge_ingress); +} + +/* + * fast_classifier_set_skip_bridge_ingress() + */ +static ssize_t fast_classifier_set_skip_bridge_ingress(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t size) +{ + long new; + int ret; + + ret = kstrtol(buf, 0, &new); + if (ret == -EINVAL || ((int)new != new)) + return -EINVAL; + + skip_to_bridge_ingress = new ? 1 : 0; + + return size; +} + +/* + * fast_classifier_get_exceptions + * dump exception counters + */ +static ssize_t fast_classifier_get_exceptions(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + int idx, len; + struct fast_classifier *sc = &__sc; + + spin_lock_bh(&sc->lock); + for (len = 0, idx = 0; idx < FAST_CL_EXCEPTION_MAX; idx++) { + if (sc->exceptions[idx]) { + len += snprintf(buf + len, (ssize_t)(PAGE_SIZE - len), "%s = %d\n", fast_classifier_exception_events_string[idx], sc->exceptions[idx]); + } + } + spin_unlock_bh(&sc->lock); + + return len; +} + +/* + * sysfs attributes. 
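Aside: the show/store handlers above are wired up by the device_attribute definitions just below and, because fast_classifier_init() creates the "fast_classifier" kobject with a NULL parent, they appear directly under /sys/fast_classifier/. A minimal userspace sketch of tuning and inspecting the module through those files; the value 32 is only an example:

#include <stdio.h>

int main(void)
{
	char line[512];
	FILE *f;

	/* Offload flows after 32 packets instead of the default 128. */
	f = fopen("/sys/fast_classifier/offload_at_pkts", "w");
	if (f) {
		fputs("32\n", f);
		fclose(f);
	}

	/* Dump the tracked connections and netlink message counters. */
	f = fopen("/sys/fast_classifier/debug_info", "r");
	if (f) {
		while (fgets(line, sizeof(line), f))
			fputs(line, stdout);
		fclose(f);
	}

	return 0;
}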
+ */ +static const struct device_attribute fast_classifier_offload_at_pkts_attr = + __ATTR(offload_at_pkts, S_IWUSR | S_IRUGO, fast_classifier_get_offload_at_pkts, fast_classifier_set_offload_at_pkts); +static const struct device_attribute fast_classifier_debug_info_attr = + __ATTR(debug_info, S_IRUGO, fast_classifier_get_debug_info, NULL); +static const struct device_attribute fast_classifier_skip_bridge_ingress = + __ATTR(skip_to_bridge_ingress, S_IWUSR | S_IRUGO, fast_classifier_get_skip_bridge_ingress, fast_classifier_set_skip_bridge_ingress); +static const struct device_attribute fast_classifier_exceptions_attr = + __ATTR(exceptions, S_IRUGO, fast_classifier_get_exceptions, NULL); + +/* + * fast_classifier_init() + */ +static int __init fast_classifier_init(void) +{ + struct fast_classifier *sc = &__sc; + int result = -1; + + printk(KERN_ALERT "fast-classifier: starting up\n"); + DEBUG_INFO("SFE CM init\n"); + + hash_init(fc_conn_ht); + + /* + * Create sys/fast_classifier + */ + sc->sys_fast_classifier = kobject_create_and_add("fast_classifier", NULL); + if (!sc->sys_fast_classifier) { + DEBUG_ERROR("failed to register fast_classifier\n"); + goto exit1; + } + + result = sysfs_create_file(sc->sys_fast_classifier, &fast_classifier_offload_at_pkts_attr.attr); + if (result) { + DEBUG_ERROR("failed to register offload at pkgs: %d\n", result); + goto exit2; + } + + result = sysfs_create_file(sc->sys_fast_classifier, &fast_classifier_debug_info_attr.attr); + if (result) { + DEBUG_ERROR("failed to register debug dev: %d\n", result); + sysfs_remove_file(sc->sys_fast_classifier, &fast_classifier_offload_at_pkts_attr.attr); + goto exit2; + } + + result = sysfs_create_file(sc->sys_fast_classifier, &fast_classifier_skip_bridge_ingress.attr); + if (result) { + DEBUG_ERROR("failed to register skip bridge on ingress: %d\n", result); + sysfs_remove_file(sc->sys_fast_classifier, &fast_classifier_offload_at_pkts_attr.attr); + sysfs_remove_file(sc->sys_fast_classifier, &fast_classifier_debug_info_attr.attr); + goto exit2; + } + + result = sysfs_create_file(sc->sys_fast_classifier, &fast_classifier_exceptions_attr.attr); + if (result) { + DEBUG_ERROR("failed to register exceptions file: %d\n", result); + sysfs_remove_file(sc->sys_fast_classifier, &fast_classifier_offload_at_pkts_attr.attr); + sysfs_remove_file(sc->sys_fast_classifier, &fast_classifier_debug_info_attr.attr); + sysfs_remove_file(sc->sys_fast_classifier, &fast_classifier_skip_bridge_ingress.attr); + goto exit2; + } + + sc->dev_notifier.notifier_call = fast_classifier_device_event; + sc->dev_notifier.priority = 1; + register_netdevice_notifier(&sc->dev_notifier); + + sc->inet_notifier.notifier_call = fast_classifier_inet_event; + sc->inet_notifier.priority = 1; + register_inetaddr_notifier(&sc->inet_notifier); + + sc->inet6_notifier.notifier_call = fast_classifier_inet6_event; + sc->inet6_notifier.priority = 1; + register_inet6addr_notifier(&sc->inet6_notifier); + + /* + * Register our netfilter hooks. + */ + result = nf_register_hooks(fast_classifier_ops_post_routing, ARRAY_SIZE(fast_classifier_ops_post_routing)); + if (result < 0) { + DEBUG_ERROR("can't register nf post routing hook: %d\n", result); + goto exit3; + } + +#ifdef CONFIG_NF_CONNTRACK_EVENTS + /* + * Register a notifier hook to get fast notifications of expired connections. 
+ */ + result = nf_conntrack_register_notifier(&init_net, &fast_classifier_conntrack_notifier); + if (result < 0) { + DEBUG_ERROR("can't register nf notifier hook: %d\n", result); + goto exit4; + } +#endif + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3, 13, 0)) + result = genl_register_family_with_ops_groups(&fast_classifier_gnl_family, + fast_classifier_gnl_ops, + fast_classifier_genl_mcgrp); + if (result) { + DEBUG_ERROR("failed to register genl ops: %d\n", result); + goto exit5; + } +#else + result = genl_register_family(&fast_classifier_gnl_family); + if (result) { + printk(KERN_CRIT "unable to register genl family\n"); + goto exit5; + } + + result = genl_register_ops(&fast_classifier_gnl_family, fast_classifier_gnl_ops); + if (result) { + printk(KERN_CRIT "unable to register ops\n"); + goto exit6; + } + + result = genl_register_mc_group(&fast_classifier_gnl_family, + fast_classifier_genl_mcgrp); + if (result) { + printk(KERN_CRIT "unable to register multicast group\n"); + goto exit6; + } +#endif + + printk(KERN_ALERT "fast-classifier: registered\n"); + + spin_lock_init(&sc->lock); + + /* + * Hook the receive path in the network stack. + */ + BUG_ON(athrs_fast_nat_recv); + RCU_INIT_POINTER(athrs_fast_nat_recv, fast_classifier_recv); + + /* + * Hook the shortcut sync callback. + */ + sfe_ipv4_register_sync_rule_callback(fast_classifier_sync_rule); + sfe_ipv6_register_sync_rule_callback(fast_classifier_sync_rule); + return 0; + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 13, 0)) +exit6: + genl_unregister_family(&fast_classifier_gnl_family); +#endif + +exit5: +#ifdef CONFIG_NF_CONNTRACK_EVENTS + nf_conntrack_unregister_notifier(&init_net, &fast_classifier_conntrack_notifier); + +exit4: +#endif + nf_unregister_hooks(fast_classifier_ops_post_routing, ARRAY_SIZE(fast_classifier_ops_post_routing)); + +exit3: + unregister_inetaddr_notifier(&sc->inet_notifier); + unregister_inet6addr_notifier(&sc->inet6_notifier); + unregister_netdevice_notifier(&sc->dev_notifier); + sysfs_remove_file(sc->sys_fast_classifier, &fast_classifier_offload_at_pkts_attr.attr); + sysfs_remove_file(sc->sys_fast_classifier, &fast_classifier_debug_info_attr.attr); + sysfs_remove_file(sc->sys_fast_classifier, &fast_classifier_skip_bridge_ingress.attr); + sysfs_remove_file(sc->sys_fast_classifier, &fast_classifier_exceptions_attr.attr); + +exit2: + kobject_put(sc->sys_fast_classifier); + +exit1: + return result; +} + +/* + * fast_classifier_exit() + */ +static void __exit fast_classifier_exit(void) +{ + struct fast_classifier *sc = &__sc; + int result = -1; + + DEBUG_INFO("SFE CM exit\n"); + printk(KERN_ALERT "fast-classifier: shutting down\n"); + + /* + * Unregister our sync callback. + */ + sfe_ipv4_register_sync_rule_callback(NULL); + sfe_ipv6_register_sync_rule_callback(NULL); + + /* + * Unregister our receive callback. + */ + RCU_INIT_POINTER(athrs_fast_nat_recv, NULL); + + /* + * Wait for all callbacks to complete. + */ + rcu_barrier(); + + /* + * Destroy all connections. 
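Aside on the generic netlink registration in fast_classifier_init() above: GENL_ID_GENERATE and genl_register_family_with_ops_groups()/genl_register_ops() exist only on older kernels. If this patch were ported to 4.10 or later, the ops and multicast groups would hang off the family itself and a single genl_register_family() call would replace the version-specific branches; a sketch of that shape, offered as an assumption rather than part of this patch:

static struct genl_family fast_classifier_gnl_family = {
	.hdrsize	= FAST_CLASSIFIER_GENL_HDRSIZE,
	.name		= FAST_CLASSIFIER_GENL_NAME,
	.version	= FAST_CLASSIFIER_GENL_VERSION,
	.maxattr	= FAST_CLASSIFIER_A_MAX,
	.module		= THIS_MODULE,
	.ops		= fast_classifier_gnl_ops,
	.n_ops		= ARRAY_SIZE(fast_classifier_gnl_ops),
	.mcgrps		= fast_classifier_genl_mcgrp,
	.n_mcgrps	= ARRAY_SIZE(fast_classifier_genl_mcgrp),
};

/* ... and in fast_classifier_init(): */
result = genl_register_family(&fast_classifier_gnl_family);
if (result) {
	DEBUG_ERROR("failed to register genl family: %d\n", result);
	goto exit5;
}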
+ */ + sfe_ipv4_destroy_all_rules_for_dev(NULL); + sfe_ipv6_destroy_all_rules_for_dev(NULL); + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 13, 0)) + result = genl_unregister_ops(&fast_classifier_gnl_family, fast_classifier_gnl_ops); + if (result != 0) { + printk(KERN_CRIT "Unable to unreigster genl_ops\n"); + } +#endif + + result = genl_unregister_family(&fast_classifier_gnl_family); + if (result != 0) { + printk(KERN_CRIT "Unable to unreigster genl_family\n"); + } + +#ifdef CONFIG_NF_CONNTRACK_EVENTS + nf_conntrack_unregister_notifier(&init_net, &fast_classifier_conntrack_notifier); + +#endif + nf_unregister_hooks(fast_classifier_ops_post_routing, ARRAY_SIZE(fast_classifier_ops_post_routing)); + + unregister_inet6addr_notifier(&sc->inet6_notifier); + unregister_inetaddr_notifier(&sc->inet_notifier); + unregister_netdevice_notifier(&sc->dev_notifier); + + kobject_put(sc->sys_fast_classifier); +} + +module_init(fast_classifier_init) +module_exit(fast_classifier_exit) + +MODULE_DESCRIPTION("Shortcut Forwarding Engine - Connection Manager"); +MODULE_LICENSE("Dual BSD/GPL"); + diff --git a/fast-classifier/fast-classifier.h b/fast-classifier/fast-classifier.h new file mode 100644 index 000000000..6b7a18cf6 --- /dev/null +++ b/fast-classifier/fast-classifier.h @@ -0,0 +1,57 @@ +/* + * User space header to send message to the fast classifier + * + * Copyright (c) 2013,2016 The Linux Foundation. All rights reserved. + * Permission to use, copy, modify, and/or distribute this software for + * any purpose with or without fee is hereby granted, provided that the + * above copyright notice and this permission notice appear in all copies. + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT + * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include + +#define FAST_CLASSIFIER_GENL_VERSION (1) +#define FAST_CLASSIFIER_GENL_NAME "FC" +#define FAST_CLASSIFIER_GENL_MCGRP "FC_MCGRP" +#define FAST_CLASSIFIER_GENL_HDRSIZE (0) + +enum { + FAST_CLASSIFIER_A_UNSPEC, + FAST_CLASSIFIER_A_TUPLE, + __FAST_CLASSIFIER_A_MAX, +}; + +#define FAST_CLASSIFIER_A_MAX (__FAST_CLASSIFIER_A_MAX - 1) + +enum { + FAST_CLASSIFIER_C_UNSPEC, + FAST_CLASSIFIER_C_OFFLOAD, + FAST_CLASSIFIER_C_OFFLOADED, + FAST_CLASSIFIER_C_DONE, + __FAST_CLASSIFIER_C_MAX, +}; + +#define FAST_CLASSIFIER_C_MAX (__FAST_CLASSIFIER_C_MAX - 1) + +struct fast_classifier_tuple { + unsigned short ethertype; + unsigned char proto; + union { + struct in_addr in; + struct in6_addr in6; + } src_saddr; + union { + struct in_addr in; + struct in6_addr in6; + } dst_saddr; + unsigned short sport; + unsigned short dport; + unsigned char smac[ETH_ALEN]; + unsigned char dmac[ETH_ALEN]; +}; diff --git a/fast-classifier/nl_classifier_test.c b/fast-classifier/nl_classifier_test.c new file mode 100644 index 000000000..639417964 --- /dev/null +++ b/fast-classifier/nl_classifier_test.c @@ -0,0 +1,281 @@ +/* + * Copyright (c) 2016 The Linux Foundation. All rights reserved. 
+ * Permission to use, copy, modify, and/or distribute this software for + * any purpose with or without fee is hereby granted, provided that the + * above copyright notice and this permission notice appear in all copies. + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT + * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include + +#define NL_CLASSIFIER_GENL_VERSION 1 +#define NL_CLASSIFIER_GENL_FAMILY "FC" +#define NL_CLASSIFIER_GENL_GROUP "FC_MCGRP" +#define NL_CLASSIFIER_GENL_HDRSIZE 0 + +enum NL_CLASSIFIER_CMD { + NL_CLASSIFIER_CMD_UNSPEC, + NL_CLASSIFIER_CMD_ACCEL, + NL_CLASSIFIER_CMD_ACCEL_OK, + NL_CLASSIFIER_CMD_CONNECTION_CLOSED, + NL_CLASSIFIER_CMD_MAX, +}; + +enum NL_CLASSIFIER_ATTR { + NL_CLASSIFIER_ATTR_UNSPEC, + NL_CLASSIFIER_ATTR_TUPLE, + NL_CLASSIFIER_ATTR_MAX, +}; + +union nl_classifier_tuple_ip { + struct in_addr in; + struct in6_addr in6; +}; + +struct nl_classifier_tuple { + unsigned short af; + unsigned char proto; + union nl_classifier_tuple_ip src_ip; + union nl_classifier_tuple_ip dst_ip; + unsigned short sport; + unsigned short dport; + unsigned char smac[6]; + unsigned char dmac[6]; +}; + +struct nl_classifier_instance { + struct nl_sock *sock; + int family_id; + int group_id; + int stop; +}; + +struct nl_classifier_instance nl_cls_inst; + +static struct nla_policy nl_classifier_genl_policy[(NL_CLASSIFIER_ATTR_MAX+1)] = { + [NL_CLASSIFIER_ATTR_TUPLE] = { .type = NLA_UNSPEC }, +}; + +void nl_classifier_dump_nl_tuple(struct nl_classifier_tuple *tuple) +{ + char ip_str[64]; + + printf("protocol = %s\n", (tuple->proto == IPPROTO_UDP) ? "udp" : ((tuple->proto == IPPROTO_TCP) ? 
"tcp" : "unknown")); + printf("source ip = %s\n", inet_ntop(tuple->af, &tuple->src_ip, ip_str, sizeof(ip_str))); + printf("destination ip = %s\n", inet_ntop(tuple->af, &tuple->dst_ip, ip_str, sizeof(ip_str))); + printf("source port = %d\n", ntohs(tuple->sport)); + printf("destination port = %d\n", ntohs(tuple->dport)); +} + +int nl_classifier_msg_recv(struct nl_msg *msg, void *arg) +{ + struct nlmsghdr *nlh = nlmsg_hdr(msg); + struct genlmsghdr *gnlh = nlmsg_data(nlh); + struct nlattr *attrs[(NL_CLASSIFIER_ATTR_MAX+1)]; + + genlmsg_parse(nlh, NL_CLASSIFIER_GENL_HDRSIZE, attrs, NL_CLASSIFIER_ATTR_MAX, nl_classifier_genl_policy); + + switch (gnlh->cmd) { + case NL_CLASSIFIER_CMD_ACCEL_OK: + printf("Acceleration successful:\n"); + nl_classifier_dump_nl_tuple(nla_data(attrs[NL_CLASSIFIER_ATTR_TUPLE])); + return NL_OK; + case NL_CLASSIFIER_CMD_CONNECTION_CLOSED: + printf("Connection is closed:\n"); + nl_classifier_dump_nl_tuple(nla_data(attrs[NL_CLASSIFIER_ATTR_TUPLE])); + return NL_OK; + default: + printf("nl classifier received unknow message %d\n", gnlh->cmd); + } + + return NL_SKIP; +} + +void nl_classifier_offload(struct nl_classifier_instance *inst, + unsigned char proto, unsigned long *src_saddr, + unsigned long *dst_saddr, unsigned short sport, + unsigned short dport, int af) +{ + struct nl_msg *msg; + int ret; + struct nl_classifier_tuple classifier_msg; + + memset(&classifier_msg, 0, sizeof(classifier_msg)); + classifier_msg.af = af; + classifier_msg.proto = proto; + memcpy(&classifier_msg.src_ip, src_saddr, (af == AF_INET ? 4 : 16)); + memcpy(&classifier_msg.dst_ip, dst_saddr, (af == AF_INET ? 4 : 16)); + classifier_msg.sport = sport; + classifier_msg.dport = dport; + + msg = nlmsg_alloc(); + if (!msg) { + printf("Unable to allocate message\n"); + return; + } + + genlmsg_put(msg, NL_AUTO_PID, NL_AUTO_SEQ, inst->family_id, + NL_CLASSIFIER_GENL_HDRSIZE, NLM_F_REQUEST, + NL_CLASSIFIER_CMD_ACCEL, NL_CLASSIFIER_GENL_VERSION); + nla_put(msg, NL_CLASSIFIER_ATTR_TUPLE, sizeof(classifier_msg), &classifier_msg); + + ret = nl_send_auto(inst->sock, msg); + if (ret < 0) { + printf("send netlink message failed.\n"); + nlmsg_free(msg); + return; + } + + nlmsg_free(msg); + printf("nl classifier offload connection successful\n"); +} + +int nl_classifier_init(struct nl_classifier_instance *inst) +{ + int ret; + + inst->sock = nl_socket_alloc(); + if (!inst->sock) { + printf("Unable to allocation socket.\n"); + return -1; + } + genl_connect(inst->sock); + + inst->family_id = genl_ctrl_resolve(inst->sock, NL_CLASSIFIER_GENL_FAMILY); + if (inst->family_id < 0) { + printf("Unable to resolve family %s\n", NL_CLASSIFIER_GENL_FAMILY); + goto init_failed; + } + + inst->group_id = genl_ctrl_resolve_grp(inst->sock, NL_CLASSIFIER_GENL_FAMILY, NL_CLASSIFIER_GENL_GROUP); + if (inst->group_id < 0) { + printf("Unable to resolve mcast group %s\n", NL_CLASSIFIER_GENL_GROUP); + goto init_failed; + } + + ret = nl_socket_add_membership(inst->sock, inst->group_id); + if (ret < 0) { + printf("Unable to add membership\n"); + goto init_failed; + } + + nl_socket_disable_seq_check(inst->sock); + nl_socket_modify_cb(inst->sock, NL_CB_VALID, NL_CB_CUSTOM, nl_classifier_msg_recv, NULL); + + printf("nl classifier init successful\n"); + return 0; + +init_failed: + if (inst->sock) { + nl_close(inst->sock); + nl_socket_free(inst->sock); + inst->sock = NULL; + } + return -1; +} + +void nl_classifier_exit(struct nl_classifier_instance *inst) +{ + if (inst->sock) { + nl_close(inst->sock); + nl_socket_free(inst->sock); + inst->sock = 
NULL; + } + printf("nl classifier exit successful\n"); +} + +int nl_classifier_parse_arg(int argc, char *argv[], unsigned char *proto, unsigned long *src_saddr, + unsigned long *dst_saddr, unsigned short *sport, unsigned short *dport, int *af) +{ + int ret; + unsigned short port; + + if (argc < 7) { + printf("help: nl_classifier \n"); + return -1; + } + + if (0 == strncmp(argv[1], "v4", 2)) { + *af = AF_INET; + } else if (0 == strncmp(argv[1], "v6", 2)) { + *af = AF_INET6; + } else { + printf("Address family is not supported"); + return -1; + } + + if (0 == strncmp(argv[2], "udp", 3)) { + *proto = IPPROTO_UDP; + } else if (0 == strncmp(argv[2], "tcp", 3)) { + *proto = IPPROTO_TCP; + } else { + printf("Protocol is not supported"); + return -1; + } + + ret = inet_pton(*af, argv[3], src_saddr); + if (ret <= 0) { + printf("source ip has wrong format\n"); + return -1; + } + + ret = inet_pton(*af, argv[4], dst_saddr); + if (ret <= 0) { + printf("destination ip has wrong format\n"); + return -1; + } + + port = strtol(argv[5], NULL, 0); + *sport = htons(port); + port = strtol(argv[6], NULL, 0); + *dport = htons(port); + + printf("nl classifier parse arguments successful\n"); + return 0; +} + +int main(int argc, char *argv[]) +{ + struct nl_classifier_instance *inst = &nl_cls_inst; + unsigned char proto; + unsigned long src_addr[4]; + unsigned long dst_addr[4]; + unsigned short sport; + unsigned short dport; + int af; + int ret; + + ret = nl_classifier_parse_arg(argc, argv, &proto, src_addr, dst_addr, &sport, &dport, &af); + if (ret < 0) { + printf("Failed to parse arguments\n"); + return ret; + } + + ret = nl_classifier_init(inst); + if (ret < 0) { + printf("Unable to init generic netlink\n"); + return ret; + } + + nl_classifier_offload(inst, proto, src_addr, dst_addr, sport, dport, af); + + /* main loop to listen on message */ + while (!inst->stop) { + nl_recvmsgs_default(inst->sock); + } + + nl_classifier_exit(inst); + + return 0; +} diff --git a/fast-classifier/userspace_example.c b/fast-classifier/userspace_example.c new file mode 100644 index 000000000..4f4113d99 --- /dev/null +++ b/fast-classifier/userspace_example.c @@ -0,0 +1,232 @@ +/* + * Copyright (c) 2013,2016 The Linux Foundation. All rights reserved. + * Permission to use, copy, modify, and/or distribute this software for + * any purpose with or without fee is hereby granted, provided that the + * above copyright notice and this permission notice appear in all copies. + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT + * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include + +#include + +static struct nl_sock *sock; +static struct nl_sock *sock_event; +static int family; +static int grp_id; + +static struct nla_policy fast_classifier_genl_policy[FAST_CLASSIFIER_A_MAX + 1] = { + [FAST_CLASSIFIER_A_TUPLE] = { .type = NLA_UNSPEC }, +}; + +void dump_fc_tuple(struct fast_classifier_tuple *fc_msg) +{ + char src_str[INET_ADDRSTRLEN]; + char dst_str[INET_ADDRSTRLEN]; + + printf("TUPLE: %d, %s, %s, %d, %d" + " SMAC=%02x:%02x:%02x:%02x:%02x:%02x", + " DMAC=%02x:%02x:%02x:%02x:%02x:%02x\n", + fc_msg->proto, + inet_ntop(AF_INET, + &fc_msg->src_saddr.in.s_addr, + src_str, + INET_ADDRSTRLEN), + inet_ntop(AF_INET, + &fc_msg->dst_saddr.in.s_addr, + dst_str, + INET_ADDRSTRLEN), + fc_msg->sport, fc_msg->dport, + fc_msg->smac[0], fc_msg->smac[1], fc_msg->smac[2], + fc_msg->smac[3], fc_msg->smac[4], fc_msg->smac[5], + fc_msg->dmac[0], fc_msg->dmac[1], fc_msg->dmac[2], + fc_msg->dmac[3], fc_msg->dmac[4], fc_msg->dmac[5]); +} + +static int parse_cb(struct nl_msg *msg, void *arg) +{ + struct nlmsghdr *nlh = nlmsg_hdr(msg); + struct genlmsghdr *gnlh = nlmsg_data(nlh); + struct nlattr *attrs[FAST_CLASSIFIER_A_MAX]; + + genlmsg_parse(nlh, 0, attrs, FAST_CLASSIFIER_A_MAX, fast_classifier_genl_policy); + + switch (gnlh->cmd) { + case FAST_CLASSIFIER_C_OFFLOADED: + printf("Got a offloaded message\n"); + dump_fc_tuple(nla_data(attrs[FAST_CLASSIFIER_A_TUPLE])); + return NL_OK; + case FAST_CLASSIFIER_C_DONE: + printf("Got a done message\n"); + dump_fc_tuple(nla_data(attrs[FAST_CLASSIFIER_A_TUPLE])); + return NL_OK; + } + + return NL_SKIP; +} + +int fast_classifier_init(void) +{ + int err; + + sock = nl_socket_alloc(); + if (!sock) { + printf("Unable to allocation socket.\n"); + return -1; + } + genl_connect(sock); + + sock_event = nl_socket_alloc(); + if (!sock_event) { + nl_close(sock); + nl_socket_free(sock); + printf("Unable to allocation socket.\n"); + return -1; + } + genl_connect(sock_event); + + family = genl_ctrl_resolve(sock, FAST_CLASSIFIER_GENL_NAME); + if (family < 0) { + nl_close(sock_event); + nl_close(sock); + nl_socket_free(sock); + nl_socket_free(sock_event); + printf("Unable to resolve family\n"); + return -1; + } + + grp_id = genl_ctrl_resolve_grp(sock, FAST_CLASSIFIER_GENL_NAME, + FAST_CLASSIFIER_GENL_MCGRP); + if (grp_id < 0) { + printf("Unable to resolve mcast group\n"); + return -1; + } + + err = nl_socket_add_membership(sock_event, grp_id); + if (err < 0) { + printf("Unable to add membership\n"); + return -1; + } + + nl_socket_disable_seq_check(sock_event); + nl_socket_modify_cb(sock_event, NL_CB_VALID, NL_CB_CUSTOM, parse_cb, NULL); + + return 0; +} + +void fast_classifier_close(void) +{ + nl_close(sock_event); + nl_close(sock); + nl_socket_free(sock_event); + nl_socket_free(sock); +} + +void fast_classifier_ipv4_offload(unsigned char proto, unsigned long src_saddr, + unsigned long dst_saddr, unsigned short sport, + unsigned short dport) +{ + struct nl_msg *msg; + int ret; +#ifdef DEBUG + char src_str[INET_ADDRSTRLEN]; + char dst_str[INET_ADDRSTRLEN]; +#endif + struct fast_classifier_tuple fc_msg; + +#ifdef DEBUG + printf("DEBUG: would offload: %d, %s, %s, %d, %d\n", proto, + inet_ntop(AF_INET, &src_saddr, src_str, INET_ADDRSTRLEN), + inet_ntop(AF_INET, &dst_saddr, dst_str, INET_ADDRSTRLEN), + sport, dport); +#endif + + fc_msg.proto = proto; + fc_msg.src_saddr.in.s_addr = src_saddr; + fc_msg.dst_saddr.in.s_addr = dst_saddr; + fc_msg.sport = sport; + fc_msg.dport = dport; + fc_msg.smac[0] = 'a'; + 
fc_msg.smac[1] = 'b'; + fc_msg.smac[2] = 'c'; + fc_msg.smac[3] = 'd'; + fc_msg.smac[4] = 'e'; + fc_msg.smac[5] = 'f'; + fc_msg.dmac[0] = 'f'; + fc_msg.dmac[1] = 'e'; + fc_msg.dmac[2] = 'd'; + fc_msg.dmac[3] = 'c'; + fc_msg.dmac[4] = 'b'; + fc_msg.dmac[5] = 'a'; + + if (fast_classifier_init() < 0) { + printf("Unable to init generic netlink\n"); + exit(1); + } + + msg = nlmsg_alloc(); + if (!msg) { + nl_socket_free(sock); + printf("Unable to allocate message\n"); + return; + } + + genlmsg_put(msg, NL_AUTO_PID, NL_AUTO_SEQ, family, + FAST_CLASSIFIER_GENL_HDRSIZE, NLM_F_REQUEST, + FAST_CLASSIFIER_C_OFFLOAD, FAST_CLASSIFIER_GENL_VERSION); + nla_put(msg, 1, sizeof(fc_msg), &fc_msg); + + ret = nl_send_auto_complete(sock, msg); + + nlmsg_free(msg); + if (ret < 0) { + printf("nlmsg_free failed"); + nl_close(sock); + nl_socket_free(sock); + return; + } + + ret = nl_wait_for_ack(sock); + if (ret < 0) { + printf("wait for ack failed"); + nl_close(sock); + nl_socket_free(sock); + return; + } +} + +void fast_classifier_listen_for_messages(void) +{ + printf("waiting for netlink events\n"); + + while (1) { + nl_recvmsgs_default(sock_event); + } +} + +int main(int argc, char *argv[]) +{ + if (fast_classifier_init() < 0) { + printf("Unable to init generic netlink\n"); + exit(1); + } + + fast_classifier_ipv4_offload('a', 0, 0, 0, 0); + + /* this never returns */ + fast_classifier_listen_for_messages(); + + fast_classifier_close(); + + return 0; +} diff --git a/https-dns-proxy/Makefile b/https-dns-proxy/Makefile index 73d0a07cf..abfb4be7a 100755 --- a/https-dns-proxy/Makefile +++ b/https-dns-proxy/Makefile @@ -1,15 +1,15 @@ include $(TOPDIR)/rules.mk PKG_NAME:=https-dns-proxy -PKG_VERSION:=2021-06-03 -PKG_RELEASE:=1 +PKG_VERSION:=2021-11-22 +PKG_RELEASE:=3 PKG_SOURCE_PROTO:=git -PKG_SOURCE_URL:=https://github.com/aarond10/https_dns_proxy -PKG_SOURCE_DATE:=2021-06-03 -PKG_SOURCE_VERSION:=5651b984f770a8bcecb14aeffc224703f8f82586 -PKG_MIRROR_HASH:=b65161936269aa3117debad0fcfce157024726b78d7e7da77c226f7aa8da5b4d -PKG_MAINTAINER:=Stan Grishin +PKG_SOURCE_URL:=https://github.com/aarond10/https_dns_proxy/ +PKG_SOURCE_DATE:=2021-11-22 +PKG_SOURCE_VERSION:=9336fd6272d67e8bb6e304fa54f3139a3d26f08f +PKG_MIRROR_HASH:=60b1ddabaf1db3a9ee19f3294a1df714364d580cef5e3c2161363c371a557456 +PKG_MAINTAINER:=Stan Grishin PKG_LICENSE:=MIT PKG_LICENSE_FILES:=LICENSE @@ -38,7 +38,10 @@ define Package/https-dns-proxy/conffiles endef define Package/https-dns-proxy/install - $(INSTALL_DIR) $(1)/usr/sbin $(1)/etc/init.d ${1}/etc/config + $(INSTALL_DIR) $(1)/usr/sbin + $(INSTALL_DIR) $(1)/etc/init.d + $(INSTALL_DIR) ${1}/etc/config + $(INSTALL_DIR) $(1)/etc/hotplug.d/iface $(INSTALL_BIN) $(PKG_BUILD_DIR)/https_dns_proxy $(1)/usr/sbin/https-dns-proxy $(INSTALL_BIN) ./files/https-dns-proxy.init $(1)/etc/init.d/https-dns-proxy $(SED) "s|^\(PKG_VERSION\).*|\1='$(PKG_VERSION)-$(PKG_RELEASE)'|" $(1)/etc/init.d/https-dns-proxy diff --git a/https-dns-proxy/files/https-dns-proxy.config b/https-dns-proxy/files/https-dns-proxy.config index 3c5eecf4d..f08e03ca9 100755 --- a/https-dns-proxy/files/https-dns-proxy.config +++ b/https-dns-proxy/files/https-dns-proxy.config @@ -1,13 +1,16 @@ config main 'config' option update_dnsmasq_config '*' - -config https-dns-proxy - option bootstrap_dns '8.8.8.8,8.8.4.4' - option resolver_url 'https://dns.google/dns-query' - option listen_addr '127.0.0.1' - option listen_port '5053' - option user 'nobody' - option group 'nogroup' + option force_dns '1' + list force_dns_port '53' + list force_dns_port '853' +# ports 
listed below are used by some +# of the dnscrypt-proxy v1 resolvers +# list force_dns_port '553' +# list force_dns_port '1443' +# list force_dns_port '4343' +# list force_dns_port '4434' +# list force_dns_port '5443' +# list force_dns_port '8443' config https-dns-proxy option bootstrap_dns '1.1.1.1,1.0.0.1' @@ -16,3 +19,11 @@ config https-dns-proxy option listen_port '5054' option user 'nobody' option group 'nogroup' + +config https-dns-proxy + option bootstrap_dns '8.8.8.8,8.8.4.4' + option resolver_url 'https://dns.google/dns-query' + option listen_addr '127.0.0.1' + option listen_port '5053' + option user 'nobody' + option group 'nogroup' diff --git a/https-dns-proxy/files/https-dns-proxy.hotplug.iface b/https-dns-proxy/files/https-dns-proxy.hotplug.iface new file mode 100644 index 000000000..c25b9e26d --- /dev/null +++ b/https-dns-proxy/files/https-dns-proxy.hotplug.iface @@ -0,0 +1,6 @@ +#!/bin/sh + +if [ "$ACTION" = 'ifup' ] && [ "$INTERFACE" = 'wan' ] && /etc/init.d/https-dns-proxy enabled; then + logger -t "https-dns-proxy" "Restarting https-dns-proxy due to $ACTION of $INTERFACE" + /etc/init.d/https-dns-proxy restart +fi diff --git a/https-dns-proxy/files/https-dns-proxy.init b/https-dns-proxy/files/https-dns-proxy.init index 8b8680763..0b9a620a5 100755 --- a/https-dns-proxy/files/https-dns-proxy.init +++ b/https-dns-proxy/files/https-dns-proxy.init @@ -1,6 +1,6 @@ #!/bin/sh /etc/rc.common -# Copyright 2019-2020 Stan Grishin (stangri@melmac.net) -# shellcheck disable=SC2039,SC3043,SC3060 +# Copyright 2019-2022 Stan Grishin (stangri@melmac.ca) +# shellcheck disable=SC1091,SC2039,SC3043,SC3060 PKG_VERSION='dev-test' # shellcheck disable=SC2034 @@ -15,9 +15,52 @@ else EXTRA_COMMANDS='version' fi +readonly packageName='https-dns-proxy' +readonly serviceName="$packageName $PKG_VERSION" +readonly sharedMemoryOutput="/dev/shm/$packageName-output" +readonly _OK_='\033[0;32m\xe2\x9c\x93\033[0m' +readonly _FAIL_='\033[0;31m\xe2\x9c\x97\033[0m' readonly PROG=/usr/sbin/https-dns-proxy +readonly DEFAULT_BOOTSTRAP='1.1.1.1,1.0.0.1,2606:4700:4700::1111,2606:4700:4700::1001,8.8.8.8,8.8.4.4,2001:4860:4860::8888,2001:4860:4860::8844' +readonly canaryDomains='use-application-dns.net' dnsmasqConfig=''; forceDNS=''; forceDNSPorts=''; +str_contains() { [ -n "$1" ] &&[ -n "$2" ] && [ "${1//$2}" != "$1" ]; } +is_mac_address() { expr "$1" : '[0-9A-F][0-9A-F]:[0-9A-F][0-9A-F]:[0-9A-F][0-9A-F]:[0-9A-F][0-9A-F]:[0-9A-F][0-9A-F]:[0-9A-F][0-9A-F]$' >/dev/null; } +is_ipv4() { expr "$1" : '[0-9][0-9]*\.[0-9][0-9]*\.[0-9][0-9]*\.[0-9][0-9]*$' >/dev/null; } +is_ipv6() { ! 
is_mac_address "$1" && str_contains "$1" ":"; } +output() { + local msg memmsg logmsg + [ -t 1 ] && printf "%b" "$@" + msg="${1//$serviceName /service }"; + if [ "$(printf "%b" "$msg" | wc -l)" -gt 0 ]; then + [ -s "$sharedMemoryOutput" ] && memmsg="$(cat "$sharedMemoryOutput")" + logmsg="$(printf "%b" "${memmsg}${msg}" | sed 's/\x1b\[[0-9;]*m//g')" + logger -t "$packageName" "$(printf "%b" "$logmsg")" + rm -f "$sharedMemoryOutput" + else + printf "%b" "$msg" >> "$sharedMemoryOutput" + fi +} +output_ok() { output "$_OK_"; } +output_okn() { output "${_OK_}\\n"; } +output_fail() { output "$_FAIL_"; } +output_failn() { output "${_FAIL_}\\n"; } +uci_add_list_if_new() { + local key="$1" value="$2" i + if [ -z "$value" ]; then + value="${key#*=}" + key="${key%=*}" + fi + [ -n "$key" ] && [ -n "$value" ] || return 1 + for i in $(uci -q get "$key"); do + [ "$i" = "$value" ] && return 0 + done + uci -q add_list "${key}=${value}" +} + +dnsmasq_restart() { [ -x /etc/init.d/dnsmasq ] || return 0; /etc/init.d/dnsmasq restart >/dev/null 2>&1; } + version() { echo "$PKG_VERSION"; } xappend() { param="$param $1"; } @@ -26,11 +69,10 @@ append_bool() { local section="$1" local option="$2" local value="$3" - local default="$4" + local default="${4:-0}" local _loctmp - [ -z "$default" ] && default="0" config_get_bool _loctmp "$section" "$option" "$default" - [ "$_loctmp" != "0" ] && xappend "$value" + [ "$_loctmp" -ne 0 ] && xappend "$value" } append_parm() { @@ -40,137 +82,191 @@ append_parm() { local default="$4" local _loctmp config_get _loctmp "$section" "$option" "$default" + [ -n "$_loctmp" ] && xappend "$switch $_loctmp" +} + +append_counter() { + local section="$1" + local option="$2" + local switch="$3" + local default="${4:-0}" + local _loctmp i + config_get _loctmp "$section" "$option" "$default" +# shellcheck disable=SC2086,SC2154 + for i in $(seq 1 $_loctmp); do + xappend '-v' + done +} + +append_bootstrap() { + local section="$1" + local option="$2" + local switch="$3" + local default="$4" + local _old_ifs="$IFS" + local _loctmp _newtmp i + config_get _loctmp "$section" "$option" "$default" [ -z "$_loctmp" ] && return 0 - xappend "$switch $_loctmp" + IFS=" ," + for i in $_loctmp; do + if { [ "$ipv6_resolvers_only" -eq 0 ] && is_ipv4 "$i"; } || \ + { [ "$ipv6_resolvers_only" -ne 0 ] && is_ipv6 "$i"; }; then + [ -z "$_newtmp" ] && _newtmp="$i" || _newtmp="${_newtmp},${i}" + fi + done + IFS="$_old_ifs" + [ -n "$_newtmp" ] && xappend "$switch $_newtmp" + [ "$ipv6_resolvers_only" -eq 0 ] && xappend '-4' } start_instance() { - local cfg="$1" param listen_addr listen_port i + local cfg="$1" param listen_addr listen_port ipv6_resolvers_only p + config_get_bool ipv6_resolvers_only "$cfg" 'use_ipv6_resolvers_only' '0' append_parm "$cfg" 'resolver_url' '-r' append_parm "$cfg" 'polling_interval' '-i' append_parm "$cfg" 'listen_addr' '-a' '127.0.0.1' - append_parm "$cfg" 'listen_port' '-p' "$p" + append_parm "$cfg" 'listen_port' '-p' "$port" append_parm "$cfg" 'dscp_codepoint' '-c' - append_parm "$cfg" 'bootstrap_dns' '-b' + append_bootstrap "$cfg" 'bootstrap_dns' '-b' "$DEFAULT_BOOTSTRAP" append_parm "$cfg" 'user' '-u' 'nobody' append_parm "$cfg" 'group' '-g' 'nogroup' append_parm "$cfg" 'proxy_server' '-t' append_parm "$cfg" 'logfile' '-l' append_bool "$cfg" 'use_http1' '-x' - config_get_bool ipv6_resolvers_only "$cfg" 'use_ipv6_resolvers_only' '0' - config_get verbosity "$cfg" 'verbosity' '0' - -# shellcheck disable=SC2086,SC2154 - for i in $(seq 1 $verbosity); do - xappend '-v' - done -# shellcheck 
disable=SC2154 - if [ "$ipv6_resolvers_only" = 0 ]; then - xappend '-4' - fi + append_counter "$cfg" 'verbosity' '-v' '0' procd_open_instance # shellcheck disable=SC2086 - procd_set_param command ${PROG} ${param} + procd_set_param command $PROG $param procd_set_param stderr 1 procd_set_param stdout 1 procd_set_param respawn - procd_close_instance - - config_get listen_addr "$cfg" 'listen_addr' '127.0.0.1' - config_get listen_port "$cfg" 'listen_port' "$p" - - if [ "$dnsmasqConfig" = "*" ]; then - config_load 'dhcp' - config_foreach dnsmasq_add_doh_server 'dnsmasq' "${listen_addr}" "${listen_port}" - elif [ -n "$dnsmasqConfig" ]; then - for i in $dnsmasqConfig; do - dnsmasq_add_doh_server "@dnsmasq[${i}]" "${listen_addr}" "${listen_port}" - done - fi - p="$((p+1))" -} - -is_force_dns_active() { iptables-save | grep -q -w -- '--dport 53'; } - -start_service() { - local p=5053 c - config_load 'https-dns-proxy' - config_get dnsmasqConfig 'config' 'update_dnsmasq_config' '*' - config_get_bool forceDNS 'config' 'force_dns' '1' - config_get forceDNSPorts 'config' 'force_dns_port' '53 853' - dhcp_backup 'create' - config_load 'https-dns-proxy' - config_foreach start_instance 'https-dns-proxy' if [ "$forceDNS" -ne 0 ]; then - procd_open_instance 'main' - procd_set_param command /bin/true - procd_set_param stdout 1 - procd_set_param stderr 1 procd_open_data json_add_array firewall - for c in $forceDNSPorts; do - if netstat -tuln | grep 'LISTEN' | grep ":${c}" >/dev/null 2>&1 || [ "$c" = "53" ]; then - json_add_object "" + for p in $forceDNSPorts; do + if netstat -tuln | grep 'LISTEN' | grep ":${p}" >/dev/null 2>&1 || [ "$p" = '53' ]; then + json_add_object '' json_add_string type redirect json_add_string target DNAT json_add_string src lan - json_add_string proto "tcp udp" - json_add_string src_dport "$c" - json_add_string dest_port "$c" + json_add_string proto 'tcp udp' + json_add_string src_dport "$p" + json_add_string dest_port "$p" json_add_boolean reflection 0 json_close_object else - json_add_object "" + json_add_object '' json_add_string type rule json_add_string src lan - json_add_string dest "*" - json_add_string proto "tcp udp" - json_add_string dest_port "$c" + json_add_string dest '*' + json_add_string proto 'tcp udp' + json_add_string dest_port "$p" json_add_string target REJECT json_close_object fi done json_close_array procd_close_data - procd_close_instance fi - if [ -n "$(uci -q changes dhcp)" ]; then - uci -q commit dhcp - [ -x /etc/init.d/dnsmasq ] && /etc/init.d/dnsmasq restart >/dev/null 2>&1 + procd_close_instance + + if [ "$?" 
]; then + config_get listen_addr "$cfg" 'listen_addr' '127.0.0.1' + config_get listen_port "$cfg" 'listen_port' "$port" + if [ "$dnsmasqConfig" = '*' ]; then + config_load 'dhcp' + config_foreach dnsmasq_doh_server 'dnsmasq' 'add' "${listen_addr}" "${listen_port}" + elif [ -n "$dnsmasqConfig" ]; then + for i in $dnsmasqConfig; do + if [ -n "$(uci -q get "dhcp.@dnsmasq[$i]")" ]; then + dnsmasq_doh_server "@dnsmasq[$i]" 'add' "${listen_addr}" "${listen_port}" + elif [ -n "$(uci -q get "dhcp.${i}")" ]; then + dnsmasq_doh_server "${i}" 'add' "${listen_addr}" "${listen_port}" + fi + done + fi + output_ok + port="$((port+1))" + forceDNS=0 + else + output_fail fi } +start_service() { + local port=5053 + output "Starting $serviceName " + config_load "$packageName" + config_get dnsmasqConfig 'config' 'update_dnsmasq_config' '*' + config_get_bool forceDNS 'config' 'force_dns' '1' + config_get forceDNSPorts 'config' 'force_dns_port' '53 853' + dhcp_backup 'create' + config_load "$packageName" + config_foreach start_instance "$packageName" + if [ -n "$(uci -q changes dhcp)" ]; then + uci -q commit dhcp + dnsmasq_restart + fi + output "\\n" +} + stop_service() { - config_load 'https-dns-proxy' + local s=0 + output "Stopping $serviceName " + config_load "$packageName" config_get dnsmasqConfig 'config' 'update_dnsmasq_config' '*' dhcp_backup 'restore' if [ -n "$(uci -q changes dhcp)" ]; then uci -q commit dhcp - [ -x /etc/init.d/dnsmasq ] && /etc/init.d/dnsmasq restart >/dev/null 2>&1 + dnsmasq_restart || s=1 fi +# shellcheck disable=SC2015 + [ "$s" -eq 0 ] && output_okn || output_failn } -service_triggers() { - procd_add_config_trigger "config.change" "https-dns-proxy" /etc/init.d/https-dns-proxy reload +# shellcheck disable=SC1091 +service_triggers() { + local iface + . /lib/functions/network.sh + network_flush_cache + network_find_wan iface + iface="${iface:-wan}" + if [ -n "$iface" ]; then + procd_add_interface_trigger "interface.*" "$iface" "/etc/init.d/${packageName}" restart + fi + procd_add_config_trigger "config.change" "$packageName" "/etc/init.d/${packageName}" restart } service_started() { procd_set_config_changed firewall; } service_stopped() { procd_set_config_changed firewall; } -dnsmasq_add_doh_server() { - local cfg="$1" address="$2" port="$3" - case $address in - 0.0.0.0|::ffff:0.0.0.0) address='127.0.0.1';; - ::) address='::1';; +dnsmasq_doh_server() { + local cfg="$1" param="$2" address="${3:-127.0.0.1}" port="$4" i + case "$param" in + add) + if [ "$forceDNS" -ne 0 ]; then + for i in $canaryDomains; do + uci_add_list_if_new "dhcp.${cfg}.server" "/${i}/" + done + fi + case $address in + 0.0.0.0|::ffff:0.0.0.0) address='127.0.0.1';; + ::) address='::1';; + esac + uci_add_list_if_new "dhcp.${cfg}.server" "${address}#${port}" + ;; + remove) + eval "$(ubus call service list "{ 'verbose': true, 'name': '$packageName' }" | jsonfilter -F '# ' -e 'TUPLES=@[*].instances[*].command[4,6]')" + for i in $TUPLES; do + uci -q del_list "dhcp.${cfg}.server=${i}" + done + ;; esac - uci -q del_list "dhcp.${cfg}.server=${address}#${port}" - uci -q add_list "dhcp.${cfg}.server=${address}#${port}" } dnsmasq_create_server_backup() { - local cfg="$1" - local i + local cfg="$1" i uci -q get "dhcp.${cfg}" >/dev/null || return 1 if ! 
uci -q get "dhcp.${cfg}.doh_backup_noresolv" >/dev/null; then if [ -z "$(uci -q get "dhcp.${cfg}.noresolv")" ]; then @@ -196,23 +292,22 @@ dnsmasq_create_server_backup() { } dnsmasq_restore_server_backup() { - local cfg="$1" - local i + local cfg="$1" i uci -q get "dhcp.${cfg}" >/dev/null || return 0 if uci -q get "dhcp.${cfg}.doh_backup_noresolv" >/dev/null; then if [ "$(uci -q get "dhcp.${cfg}.doh_backup_noresolv")" = "0" ]; then uci -q set "dhcp.${cfg}.noresolv=0" - else + else uci -q del "dhcp.${cfg}.noresolv" fi uci -q del "dhcp.${cfg}.doh_backup_noresolv" fi if uci -q get "dhcp.${cfg}.doh_backup_server" >/dev/null; then - uci -q del "dhcp.${cfg}.server" + dnsmasq_doh_server "$cfg" 'remove' for i in $(uci -q get "dhcp.${cfg}.doh_backup_server"); do - uci -q add_list "dhcp.${cfg}.server=$i" + uci_add_list_if_new "dhcp.${cfg}.server" "$i" done - uci -q del "dhcp.${cfg}.doh_backup_server" + uci -q del "dhcp.${cfg}.doh_backup_server" fi } @@ -225,8 +320,11 @@ dhcp_backup() { config_foreach dnsmasq_create_server_backup 'dnsmasq' elif [ -n "$dnsmasqConfig" ]; then for i in $dnsmasqConfig; do - dnsmasq_create_server_backup "@dnsmasq[${i}]" || \ + if [ -n "$(uci -q get "dhcp.@dnsmasq[$i]")" ]; then + dnsmasq_create_server_backup "@dnsmasq[$i]" + elif [ -n "$(uci -q get "dhcp.${i}")" ]; then dnsmasq_create_server_backup "$i" + fi done fi ;; diff --git a/https-dns-proxy/patches/010-fix-cmakelists.patch b/https-dns-proxy/patches/010-fix-cmakelists.patch new file mode 100644 index 000000000..106142579 --- /dev/null +++ b/https-dns-proxy/patches/010-fix-cmakelists.patch @@ -0,0 +1,15 @@ +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -21,9 +21,9 @@ if(NOT CMAKE_BUILD_TYPE) + message(STATUS "Setting build type to '${CMAKE_BUILD_TYPE}' as none was specified.") + endif() + +-set(CMAKE_C_FLAGS "-Wall -Wextra --pedantic -Wno-strict-aliasing -Wno-variadic-macros") +-set(CMAKE_C_FLAGS_DEBUG "-g -DDEBUG") +-set(CMAKE_C_FLAGS_RELEASE "-O2") ++#set(CMAKE_C_FLAGS "-Wall -Wextra --pedantic -Wno-strict-aliasing -Wno-variadic-macros") ++#set(CMAKE_C_FLAGS_DEBUG "-g -DDEBUG") ++#set(CMAKE_C_FLAGS_RELEASE "-O2") + + if ((CMAKE_C_COMPILER_ID MATCHES GNU AND CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL 9) OR + (CMAKE_C_COMPILER_ID MATCHES Clang AND CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL 10)) diff --git a/netmaker-openwrt/LICENSE b/netmaker-openwrt/LICENSE new file mode 100644 index 000000000..472ac2361 --- /dev/null +++ b/netmaker-openwrt/LICENSE @@ -0,0 +1,8 @@ +MIT License +Copyright (c) + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
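Note on the https-dns-proxy init script changes earlier in this patch: each started proxy instance is registered with dnsmasq by appending a `listen_addr#listen_port` entry (and, when DNS forcing is active, the `use-application-dns.net` canary domain) to the dnsmasq `server` uci list; the entries are removed again on stop by reading the listen address and port of the running instances back via ubus. A quick sanity check on a live router, assuming the default two instances on ports 5053/5054, might look like this (section name and exact entries depend on the local configuration):

```shell
# Dump the dnsmasq upstream list after the service has started
uci -q get dhcp.@dnsmasq[0].server
# Expected to contain entries along the lines of:
#   /use-application-dns.net/ 127.0.0.1#5054 127.0.0.1#5053
```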
diff --git a/netmaker-openwrt/README.md b/netmaker-openwrt/README.md new file mode 100644 index 000000000..60b189cce --- /dev/null +++ b/netmaker-openwrt/README.md @@ -0,0 +1,111 @@ +# Netmaker-OpenWRT + +[Netmaker](https://github.com/gravitl/netmaker) is a platform for creating and managing fast, secure, and dynamic virtual overlay networks using WireGuard. This project offers OpenWRT packages for Netmaker. + +## Installing package + +Download the prebuild package and copy it onto your OpenWRT installation, preferably into the `/tmp` folder. + +Then install the ipk package file: + +```bash +opkg install netmaker_*.ipk +``` + +Now start `netclient` of Netmaker: + +```bash +/etc/init.d/netclient start +``` + +## Compiling from Sources + +To include Netmaker into your OpenWRT image or to create an `.ipk` package (equivalent to Debians .deb files), you have to build an OpenWRT image. + +Now prepare OpenWRT: + +```bash +git clone https://github.com/openwrt/openwrt +cd openwrt + +./scripts/feeds update -a +./scripts/feeds install -a +``` + +To build Netmaker for OpenWRT, you need to have Golang with OpenWRT build envirment. Then, you can insert the Netmaker package using a package feed or add the package manually. + +### Add package by feed + +A feed is the standard way packages are made available to the OpenWRT build system. + +Put this line in your feeds list file (e.g. feeds.conf.default) + +```bash +src-git netmaker http://github.com/sbilly/netmaker-openwrt.git +``` + +Update and install the new feed + +```bash +./scripts/feeds update netmaker +./scripts/feeds install netmaker +``` + +Now continue with the building packages section. + +## Building Packages + +Configure packages: + +```bash +make menuconfig +``` + +Now select the appropiate "Target System" and "Target Profile" depending on what target chipset/router you want to build for. Also mark the Netmaker package under `Network ---> VPN ---> <*> netmaker`. + +Now compile/build everything: + +```bash +make +``` + +The images and all *.ipk packages are now inside the bin/ folder, including the netmaker package. You can install the Netmaker .ipk on the target device using opkg install . + +For details please check the OpenWRT documentation. 
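+Assuming an x86_64 target was selected, the built package can then be copied to a
+router and installed in one step. The output path and the router address
+(192.168.1.1) below are only examples and will differ for other targets:
+
+```bash
+# Locate the built package (the path varies with the selected target architecture)
+ls bin/packages/x86_64/netmaker/netmaker_*.ipk
+
+# Copy it to the device and install it
+scp bin/packages/x86_64/netmaker/netmaker_*.ipk root@192.168.1.1:/tmp/
+ssh root@192.168.1.1 'opkg install /tmp/netmaker_*.ipk && /etc/init.d/netclient start'
+```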
+ +## Build bulk packages + +For a release, it is useful the build packages at a bulk for multiple targets: + +```shell +#!/bin/sh + +# dump-target-info.pl is used to get all targets configurations: +# https://git.openwrt.org/?p=openwrt/openwrt.git;a=blob;f=scripts/dump-target-info.pl + +./scripts/dump-target-info.pl architectures | while read pkgarch target1 rest; do + echo "CONFIG_TARGET_${target1%/*}=y" > .config + echo "CONFIG_TARGET_${target1%/*}_${target1#*/}=y" >> .config + echo "CONFIG_PACKAGE_example1=y" >> .config + + # Debug output + echo "pkgarch: $pkgarch, target1: $target1" + + make defconfig + make -j4 tools/install + make -j4 toolchain/install + + # Build package + make package/netmaker/{clean,compile} + + # Free space (optional) + rm -rf build_dir/target-* + rm -rf build_dir/toolchain-* +done +``` + +## Thanks + +- [netmaker](https://github.com/gravitl/netmaker) +- [zerotier-openwrt](https://github.com/mwarning/zerotier-openwrt) +- [openwrt-golang-package-test-feed](https://github.com/jefferyto/openwrt-golang-package-test-feed) diff --git a/netmaker-openwrt/netmaker/Makefile b/netmaker-openwrt/netmaker/Makefile new file mode 100644 index 000000000..903dd2beb --- /dev/null +++ b/netmaker-openwrt/netmaker/Makefile @@ -0,0 +1,93 @@ +# +# Copyright (C) 2019 sbilly +# +# This is free software, licensed under the MIT License. +# See /LICENSE for more information. +# + +include $(TOPDIR)/rules.mk + +PKG_NAME:=netmaker +PKG_VERSION:=0.9.4 +PKG_RELEASE:=1 + +PKG_SOURCE_PROTO:=git +PKG_SOURCE_URL:=https://github.com/gravitl/netmaker.git +PKG_SOURCE_VERSION:=e9bce264719f88c30e252ecc754d08f422f4c080 +PKG_SOURCE_DATE:=20220117 +PKG_MIRROR_HASH:=skip + +PKG_LICENSE:=MIT +PKG_LICENSE_FILES:=LICENSE +PKG_MAINTAINER:=sbilly + +PKG_BUILD_DEPENDS:=golang/host +PKG_BUILD_PARALLEL:=1 +PKG_USE_MIPS16:=0 + +GO_PKG:=github.com/gravitl/netmaker +GO_PKG_INSTALL_EXTRA:=extra/file extra/dir +GO_PKG_EXCLUDES:=excluded +GO_PKG_LDFLAGS:=-s -w + +include $(INCLUDE_DIR)/package.mk +include $(TOPDIR)/feeds/packages/lang/golang/golang-package.mk + +define Package/netmaker +$(call Package/netmaker/Default) +$(call GoPackage/GoSubMenu) + SECTION:=net + CATEGORY:=Network + SUBMENU:=VPN +endef + +define Package/netmaker/Default + TITLE:=Netmaker for OpenWRT + URL:=https://github.com/gravitl/netmaker + DEPENDS:=$(GO_ARCH_DEPENDS) + MAINTAINER:=sbilly +endef + +define Package/netmaker/Default/description +Netmaker is a platform for creating and managing fast, secure, and +dynamic virtual overlay networks using WireGuard. This project offers +OpenWRT packages for Netmaker. +endef + +define Package/netmaker/description +$(call Package/netmaker/Default/description) + +This package contains the binaries. +endef + +define Package/netmaker-dev + TITLE+= (source files) + SECTION:=net + CATEGORY:=Network + SUBMENU:=VPN + PKGARCH:=all +endef + +define Package/netmaker-dev/description +$(call Package/netmaker/Default/description) + +This package provides the source files. 
+endef + +define Package/netmaker/install + $(INSTALL_DIR) $(1)/etc/netclient/ + $(INSTALL_DIR) $(1)/etc/netclient/config + $(INSTALL_DIR) $(1)/etc/systemd/ + $(INSTALL_DIR) $(1)/etc/systemd/system + $(INSTALL_DIR) $(1)/usr/bin + $(INSTALL_BIN) $(GO_PKG_BUILD_BIN_DIR)/netmaker $(1)/usr/bin/ + $(INSTALL_BIN) $(GO_PKG_BUILD_BIN_DIR)/netclient $(1)/usr/bin/ + $(CP) ./root/* $(1)/ + $(LN) netclient $(1)/etc/netclient/netclient +endef + +$(eval $(call GoBinPackage,netmaker)) +$(eval $(call BuildPackage,netmaker)) + +$(eval $(call GoSrcPackage,netmaker-dev)) +$(eval $(call BuildPackage,netmaker-dev)) diff --git a/netmaker-openwrt/netmaker/root/etc/init.d/netclient b/netmaker-openwrt/netmaker/root/etc/init.d/netclient new file mode 100644 index 000000000..c04977f24 --- /dev/null +++ b/netmaker-openwrt/netmaker/root/etc/init.d/netclient @@ -0,0 +1,42 @@ +#!/bin/sh /etc/rc.common +#Created by oycol + +EXTRA_COMMANDS="status" +EXTRA_HELP=" status Check service is running" +START=99 + +LOG_FILE="/tmp/netclient.logs" + +start() { + mkdir -p /etc/netclient/config + mkdir -p /etc/systemd/system + + if [ ! -f "${LOG_FILE}" ];then + touch "${LOG_FILE}" + fi + local PID=$(ps|grep "netclient checkin -n all"|grep -v grep|awk '{print $1}') + if [ "${PID}" ];then + echo "service is running" + return + fi + /bin/sh -c "while [ 1 ]; do netclient checkin -n all >> ${LOG_FILE} 2>&1;sleep 15;\ + if [ $(ls -l ${LOG_FILE}|awk '{print $5}') -gt 10240000 ];then tar zcf "${LOG_FILE}.tar" -C / "tmp/netclient.logs" && > $LOG_FILE;fi;done &" + echo "start" +} + +stop() { + local PID=$(ps|grep "netclient checkin -n all"|grep -v grep|awk '{print $1}') + if [ "${PID}" ];then + kill "${PID}" + fi + echo "stop" +} + +status() { + local PID=$(ps|grep "netclient checkin -n all"|grep -v grep|awk '{print $1}') + if [ "${PID}" ];then + echo -e "netclient[${PID}] is running \n" + else + echo -e "netclient is not running \n" + fi +} diff --git a/netmaker-openwrt/scripts/build_ipk.sh b/netmaker-openwrt/scripts/build_ipk.sh new file mode 100644 index 000000000..1ee25de06 --- /dev/null +++ b/netmaker-openwrt/scripts/build_ipk.sh @@ -0,0 +1,177 @@ +#!/bin/bash + +# setting working directory +WORK_DIR="/home/user" + +# setting branch +if [ "${OPENWRT_BRANCH}" = "" ] +then + DEFAULT_OPENWRT_BRANCH="openwrt-21.02" +else + DEFAULT_OPENWRT_BRANCH="${OPENWRT_BRANCH}" +fi + +download_openwrt() { + cd ${WORK_DIR} + + # pull code + if [ ! 
-d "openwrt" ]; then + git clone https://git.openwrt.org/openwrt/openwrt.git + fi +} + +change_openwrt_branch() { + cd ${WORK_DIR}/openwrt + + if [ "${1}" = "" ] + then + echo "Building ${DEFAULT_OPENWRT_BRANCH}" + git checkout -B ${DEFAULT_OPENWRT_BRANCH} origin/${DEFAULT_OPENWRT_BRANCH} + else + echo "Building ${1}" + git checkout -B ${1} origin/${1} + fi +} + +init_openwrt_branch() { + cd ${WORK_DIR}/openwrt + + git stash + git pull --all + git pull --tags +} + +init_openwrt_link() { + cd ${WORK_DIR}/openwrt + + sudo chown 1000:1000 /src -R + + mkdir -p /src/dl + mkdir -p /src/staging_dir + mkdir -p /src/build_dir + mkdir -p /src/tmp + mkdir -p /src/bin + + ln -s /src/dl ${WORK_DIR}/openwrt/dl + ln -s /src/staging_dir ${WORK_DIR}/openwrt/staging_dir + ln -s /src/build_dir ${WORK_DIR}/openwrt/build_dir + ln -s /src/tmp ${WORK_DIR}/openwrt/tmp +} + +update_install_openwrt_feeds() { + cd ${WORK_DIR}/openwrt + + ./scripts/feeds update -a + ./scripts/feeds install -a +} + +openwrt_init_config() { + cd ${WORK_DIR}/openwrt + + echo "CONFIG_TARGET_x86=y" > ${WORK_DIR}/openwrt/.config + echo "CONFIG_TARGET_x86_64=y" >> ${WORK_DIR}/openwrt/.config +} + +openwrt_make_build_env() { + cd ${WORK_DIR}/openwrt + + make defconfig + make -j4 download + make -j4 tools/install + make -j4 toolchain/install +} + +openwrt_make() { + cd ${WORK_DIR}/openwrt + + make -j4 +} + +openwrt_install_netmaker_feeds() { + cd ${WORK_DIR}/openwrt + + echo "src-git netmaker http://github.com/sbilly/netmaker-openwrt.git" >> feeds.conf.default + + ./scripts/feeds update netmaker + ./scripts/feeds install netmaker +} + +openwrt_install_package_netmaker_config() { + cd ${WORK_DIR}/openwrt + + echo "CONFIG_FEED_netmaker=y" >> ${WORK_DIR}/openwrt/.config + echo "CONFIG_PACKAGE_netmaker=m" >> ${WORK_DIR}/openwrt/.config + echo "CONFIG_PACKAGE_netmaker-dev=m" >> ${WORK_DIR}/openwrt/.config +} + + +openwrt_patch_golang_host() { + cd ${WORK_DIR}/openwrt + echo "patching ${1}" + + if [ "${1}" = "openwrt-19.07" ] + then + sed -i 's/5fb43171046cf8784325e67913d55f88a683435071eef8e9da1aa8a1588fcf5d/2255eb3e4e824dd7d5fcdc2e7f84534371c186312e546fb1086a34c17752f431/g' ${WORK_DIR}/openwrt/feeds/packages/lang/golang/golang/Makefile + sed -i 's/1.13/1.17/g' ${WORK_DIR}/openwrt/feeds/packages/lang/golang/golang-version.mk + sed -i 's/15/2/g' ${WORK_DIR}/openwrt/feeds/packages/lang/golang/golang-version.mk + fi + + if [ "${1}" = "openwrt-18.06" ] + then + sed -i 's/6faf74046b5e24c2c0b46e78571cca4d65e1b89819da1089e53ea57539c63491/2255eb3e4e824dd7d5fcdc2e7f84534371c186312e546fb1086a34c17752f431/g' ${WORK_DIR}/openwrt/feeds/packages/lang/golang/golang/Makefile + sed -i 's/1.10/1.17/g' ${WORK_DIR}/openwrt/feeds/packages/lang/golang/golang-version.mk + sed -i 's/8/2/g' ${WORK_DIR}/openwrt/feeds/packages/lang/golang/golang-version.mk + fi +} + +openwrt_make_netmaker_package() { + cd ${WORK_DIR}/openwrt + + make defconfig + make toolchain/gcc/final/compile + make package/netmaker/clean + find ./ -type d | xargs -n1 sudo chmod 755 -R + make package/netmaker/compile V=s +} + + +openwrt_copy_pacage() { + echo ${1} + echo > /tmp/copy.sh + + cd ${WORK_DIR}/openwrt/bin/packages/x86_64/netmaker/ + + for ipk in ./*.ipk + do + if [ -f "$ipk" ] + then + echo ${ipk} | gawk -F".ipk" -v BRANCH=${1} '{ print "cp -rfv "$0" /src/bin/"$1"-"BRANCH".ipk" }' >> /tmp/copy.sh + fi + done + + /bin/bash /tmp/copy.sh +} + +download_openwrt + +change_openwrt_branch ${DEFAULT_OPENWRT_BRANCH} + +init_openwrt_branch + +init_openwrt_link + +openwrt_install_netmaker_feeds + 
+update_install_openwrt_feeds + +openwrt_init_config + +openwrt_install_package_netmaker_config + +openwrt_patch_golang_host ${DEFAULT_OPENWRT_BRANCH} + +openwrt_make_netmaker_package + +openwrt_copy_pacage ${DEFAULT_OPENWRT_BRANCH} + +ls -alF ${WORK_DIR}/openwrt/bin/ /src/bin diff --git a/shortcut-fe/.gitignore b/shortcut-fe/.gitignore new file mode 100644 index 000000000..958088547 --- /dev/null +++ b/shortcut-fe/.gitignore @@ -0,0 +1,5 @@ +# Ouptut files + +*.o +*.s + diff --git a/shortcut-fe/Kconfig b/shortcut-fe/Kconfig new file mode 100644 index 000000000..487f1e065 --- /dev/null +++ b/shortcut-fe/Kconfig @@ -0,0 +1,14 @@ +# +# Shortcut forwarding engine +# + +config SHORTCUT_FE + tristate "Shortcut Forwarding Engine" + depends on NF_CONNTRACK + ---help--- + Shortcut is a fast in-kernel packet forwarding engine. + + To compile this code as a module, choose M here: the module will be + called shortcut-fe. + + If unsure, say N. diff --git a/shortcut-fe/Makefile b/shortcut-fe/Makefile new file mode 100644 index 000000000..3b1ceaa44 --- /dev/null +++ b/shortcut-fe/Makefile @@ -0,0 +1,23 @@ +# +# Makefile for Shortcut FE. +# + +obj-m += shortcut-fe.o + +ifdef SFE_SUPPORT_IPV6 +obj-m += shortcut-fe-ipv6.o +endif + +obj-m += shortcut-fe-cm.o + +shortcut-fe-objs := \ + sfe_ipv4.o + +ifdef SFE_SUPPORT_IPV6 +shortcut-fe-ipv6-objs := \ + sfe_ipv6.o +endif + +shortcut-fe-cm-objs := \ + sfe_cm.o + diff --git a/shortcut-fe/README b/shortcut-fe/README new file mode 100644 index 000000000..1bf1cc255 --- /dev/null +++ b/shortcut-fe/README @@ -0,0 +1,122 @@ +Shortcut Forwarding Engine +-------------------------- + +Welcome to "Shortcut" :-) + +Here's a quick FAQ: + + +Q) What is Shortcut? + +A) Shortcut is an in-Linux-kernel IP packet forwarding engine. It's designed +to offer very high speed IP packet forwarding based on IP connection tracking. +It's dramatically faster than the standard netfilter-based NAT forwarding path +but is designed to synchronise state back to netfilter/conntrack so that it +doesn't need to deal with all of the complexities of special cases. + + +Q) What versions of IP does it support? + +A) The current version only supports IPv4 but will be extended to support IPv6 in +the future. + + +Q) What transport protocols does it support? + +A) TCP and UDP. It also knows enough about ICMP to spot ICMP error messages +related to TCP and UDP and handle things accordingly. + + +Q) Is there a design spec for this software? + +A) Not at the moment. I'll write one when I get more time. The code is +intended to be a good tutorial though - it's very heavily commented. If you +find yourself reading something and not understanding it then I take that to +mean I've probably not done a sufficently good job of explaining what it's +doing in the comments. Let me know - I will try to fix it :-) + + +Q) Why was it written? + +A) It was written as a demonstration of what can be done to provide high +performance forwarding inside the kernel. There were two initial motivations: + +1) To provide a platform to enable research into how QoS analysis systems can +offload work and avoid huge Linux overheads. + +2) To provide a tool to investigate the behaviour of various processors, SoCs +and software sets so that we can characterize and design new network processor +SoCs. + + +Q) How much faster is it than the Linux kernel forwarding path? + +A) At the time of pushing this to github it's been tested on a QCA AP135. +This has a Scorpion (QCA Scopion, not the QMC one :-)) SoC, QCA9550. 
The +SoC's processor is a MIPS74K running at 720 MHz and with a DDR2 memory +subsystem that offers a peak of 600 MT/s (16-bit transfers). + +Running IPv4 NAT forwarding of UDP between the board's 2 GMAC ports and +using a SmartBits 200 as a traffic generator Linux is able to forward 70k PPS. +Once the SFE code is invoked this will increase to 350k PPS! + +There's also a slightly hacky mode which causes SFE to bypass the Linux +bridge layer, but this isn't really ready for use because it doesn't have +sufficient MAC address checks or integration of statistics back to the +Ethernet bridge, but that runs at 436k PPS. + + +Q) Are there any diagnostics? + +A) Yes, this is a research tool after all! There's a complex way to do this +that's more general purpose and a simple one - here's the simple one: + + mknod /dev/sfe c 253 0 + +The file /dev/sfe is an XML-ish output and provides details of all the +network connections currently being offloaded. It also reports the numbers +of packets that took various "exception" paths within the code. In addition +it provides a summary of the number of connections, attempts to accelerate +connections, cancel accelerations, etc. It also reports the numbers of +packets that were forwarded and not forwarded by the engine and has some +stats on the effectiveness of the hashing algorithm it uses. + + +Q) How does the code interact with Linux? + +A) There are four minor patches required to make this software run with +Linux. These are currently against a 3.3.8 or 3.4.0 kernel: + +* (net/core/dev.c) adds a hook to allow packets to be extracted out. + +* (net/netfilter/nf_conntrack_proto_tcp.c) exposes a state variable inside + netfilter that's necessary to enable TCP sequence and ACK checking within + the offload path. Note that this specific patch is against the QCA QSDK + patched version of 3.3.8 - there's a slightly braindead "performance" + patch in that kernel, courtesy of the OpenWrt community that makes the + Linux forwarding path slightly faster at the expense of losing + functionality :-( + +* (net/Kconfig) adds the shortcut-fe option. + +* (net/Makefile) adds the shortcut-fe build support. + +Once these are applied and the module is loaded then everything else +is automatic :-) The patches are in this git repo. + + +Q) Are any of the pieces reused from other projects? + +A) Yes! Some of the forwarding concepts are reused from the Ubicom Network +Accelerator that morphed into part of the Akronite NSS. This code has all +been substantially changed though to accomodate Linux's needs. + +There are also some pieces that I borrowed from the QCA "FastNAT" software +written by Xiaoping Fan . Xiaoping's code was the +first actual demonstration within QCA that this in-kernel concept could yield +signficant performance gains. + + +Enjoy! +Dave Hudson + diff --git a/shortcut-fe/fast-classifier/fast-classifier.c b/shortcut-fe/fast-classifier/fast-classifier.c new file mode 100644 index 000000000..d79404cba --- /dev/null +++ b/shortcut-fe/fast-classifier/fast-classifier.c @@ -0,0 +1,1892 @@ +/* + * fast-classifier.c + * Shortcut forwarding engine connection manager. + * fast-classifier + * + * Copyright (c) 2013-2018 The Linux Foundation. All rights reserved. + * Permission to use, copy, modify, and/or distribute this software for + * any purpose with or without fee is hereby granted, provided that the + * above copyright notice and this permission notice appear in all copies. 
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT + * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include "fast-classifier.h" + +typedef enum fast_classifier_exception { + FAST_CL_EXCEPTION_PACKET_BROADCAST, + FAST_CL_EXCEPTION_PACKET_MULTICAST, + FAST_CL_EXCEPTION_NO_IIF, + FAST_CL_EXCEPTION_NO_CT, + FAST_CL_EXCEPTION_CT_NO_TRACK, + FAST_CL_EXCEPTION_CT_NO_CONFIRM, + FAST_CL_EXCEPTION_CT_IS_ALG, + FAST_CL_EXCEPTION_IS_IPV4_MCAST, + FAST_CL_EXCEPTION_IS_IPV6_MCAST, + FAST_CL_EXCEPTION_TCP_NOT_ASSURED, + FAST_CL_EXCEPTION_TCP_NOT_ESTABLISHED, + FAST_CL_EXCEPTION_UNKNOW_PROTOCOL, + FAST_CL_EXCEPTION_NO_SRC_DEV, + FAST_CL_EXCEPTION_NO_SRC_XLATE_DEV, + FAST_CL_EXCEPTION_NO_DEST_DEV, + FAST_CL_EXCEPTION_NO_DEST_XLATE_DEV, + FAST_CL_EXCEPTION_NO_BRIDGE, + FAST_CL_EXCEPTION_LOCAL_OUT, + FAST_CL_EXCEPTION_WAIT_FOR_ACCELERATION, + FAST_CL_EXCEPTION_UPDATE_PROTOCOL_FAIL, + FAST_CL_EXCEPTION_CT_DESTROY_MISS, + FAST_CL_EXCEPTION_MAX +} fast_classifier_exception_t; + +static char *fast_classifier_exception_events_string[FAST_CL_EXCEPTION_MAX] = { + "PACKET_BROADCAST", + "PACKET_MULTICAST", + "NO_IIF", + "NO_CT", + "CT_NO_TRACK", + "CT_NO_CONFIRM", + "CT_IS_ALG", + "IS_IPV4_MCAST", + "IS_IPV6_MCAST", + "TCP_NOT_ASSURED", + "TCP_NOT_ESTABLISHED", + "UNKNOW_PROTOCOL", + "NO_SRC_DEV", + "NO_SRC_XLATE_DEV", + "NO_DEST_DEV", + "NO_DEST_XLATE_DEV", + "NO_BRIDGE", + "LOCAL_OUT", + "WAIT_FOR_ACCELERATION", + "UPDATE_PROTOCOL_FAIL", + "CT_DESTROY_MISS", +}; + +/* + * Per-module structure. + */ +struct fast_classifier { + spinlock_t lock; /* Lock for SMP correctness */ + + /* + * Control state. + */ + struct kobject *sys_fast_classifier; /* sysfs linkage */ + + /* + * Callback notifiers. 
+ */ + struct notifier_block dev_notifier; /* Device notifier */ + struct notifier_block inet_notifier; /* IPv4 notifier */ + struct notifier_block inet6_notifier; /* IPv6 notifier */ + u32 exceptions[FAST_CL_EXCEPTION_MAX]; +}; + +static struct fast_classifier __sc; + +static struct nla_policy fast_classifier_genl_policy[FAST_CLASSIFIER_A_MAX + 1] = { + [FAST_CLASSIFIER_A_TUPLE] = { + .type = NLA_UNSPEC, + .len = sizeof(struct fast_classifier_tuple) + }, +}; + +static struct genl_multicast_group fast_classifier_genl_mcgrp[] = { + { + .name = FAST_CLASSIFIER_GENL_MCGRP, + }, +}; + +static struct genl_family fast_classifier_gnl_family = { + .id = GENL_ID_GENERATE, + .hdrsize = FAST_CLASSIFIER_GENL_HDRSIZE, + .name = FAST_CLASSIFIER_GENL_NAME, + .version = FAST_CLASSIFIER_GENL_VERSION, + .maxattr = FAST_CLASSIFIER_A_MAX, +}; + +static int fast_classifier_offload_genl_msg(struct sk_buff *skb, struct genl_info *info); +static int fast_classifier_nl_genl_msg_DUMP(struct sk_buff *skb, struct netlink_callback *cb); + +static struct genl_ops fast_classifier_gnl_ops[] = { + { + .cmd = FAST_CLASSIFIER_C_OFFLOAD, + .flags = 0, + .policy = fast_classifier_genl_policy, + .doit = fast_classifier_offload_genl_msg, + .dumpit = NULL, + }, + { + .cmd = FAST_CLASSIFIER_C_OFFLOADED, + .flags = 0, + .policy = fast_classifier_genl_policy, + .doit = NULL, + .dumpit = fast_classifier_nl_genl_msg_DUMP, + }, + { + .cmd = FAST_CLASSIFIER_C_DONE, + .flags = 0, + .policy = fast_classifier_genl_policy, + .doit = NULL, + .dumpit = fast_classifier_nl_genl_msg_DUMP, + }, +}; + +static atomic_t offload_msgs = ATOMIC_INIT(0); +static atomic_t offload_no_match_msgs = ATOMIC_INIT(0); +static atomic_t offloaded_msgs = ATOMIC_INIT(0); +static atomic_t done_msgs = ATOMIC_INIT(0); + +static atomic_t offloaded_fail_msgs = ATOMIC_INIT(0); +static atomic_t done_fail_msgs = ATOMIC_INIT(0); + +/* + * Accelerate incoming packets destined for bridge device + * If a incoming packet is ultimatly destined for + * a bridge device we will first see the packet coming + * from the phyiscal device, we can skip straight to + * processing the packet like it came from the bridge + * for some more performance gains + * + * This only works when the hook is above the bridge. We + * only implement ingress for now, because for egress we + * want to have the bridge devices qdiscs be used. + */ +static bool skip_to_bridge_ingress; + +/* + * fast_classifier_incr_exceptions() + * increase an exception counter. + */ +static inline void fast_classifier_incr_exceptions(fast_classifier_exception_t except) +{ + struct fast_classifier *sc = &__sc; + + spin_lock_bh(&sc->lock); + sc->exceptions[except]++; + spin_unlock_bh(&sc->lock); +} + +/* + * fast_classifier_recv() + * Handle packet receives. + * + * Returns 1 if the packet is forwarded or 0 if it isn't. + */ +int fast_classifier_recv(struct sk_buff *skb) +{ + struct net_device *dev; + struct net_device *master_dev = NULL; + int ret = 0; + + /* + * We know that for the vast majority of packets we need the transport + * layer header so we may as well start to fetch it now! + */ + prefetch(skb->data + 32); + barrier(); + + dev = skb->dev; + + /* + * Process packet like it arrived on the bridge device + */ + if (skip_to_bridge_ingress && + (dev->priv_flags & IFF_BRIDGE_PORT)) { + master_dev = sfe_dev_get_master(dev); + if (!master_dev) { + DEBUG_WARN("master dev is NULL %s\n", dev->name); + goto rx_exit; + } + dev = master_dev; + } + + /* + * We're only interested in IPv4 and IPv6 packets. 
+ */ + if (likely(htons(ETH_P_IP) == skb->protocol)) { + struct in_device *in_dev; + + /* + * Does our input device support IP processing? + */ + in_dev = (struct in_device *)dev->ip_ptr; + if (unlikely(!in_dev)) { + DEBUG_TRACE("no IP processing for device: %s\n", dev->name); + goto rx_exit; + } + + /* + * Does it have an IP address? If it doesn't then we can't do anything + * interesting here! + */ + if (unlikely(!in_dev->ifa_list)) { + DEBUG_TRACE("no IP address for device: %s\n", dev->name); + goto rx_exit; + } + + ret = sfe_ipv4_recv(dev, skb); + + } else if (likely(htons(ETH_P_IPV6) == skb->protocol)) { + struct inet6_dev *in_dev; + + /* + * Does our input device support IPv6 processing? + */ + in_dev = (struct inet6_dev *)dev->ip6_ptr; + if (unlikely(!in_dev)) { + DEBUG_TRACE("no IPv6 processing for device: %s\n", dev->name); + goto rx_exit; + } + + /* + * Does it have an IPv6 address? If it doesn't then we can't do anything + * interesting here! + */ + if (unlikely(list_empty(&in_dev->addr_list))) { + DEBUG_TRACE("no IPv6 address for device: %s\n", dev->name); + goto rx_exit; + } + + ret = sfe_ipv6_recv(dev, skb); + + } else { + DEBUG_TRACE("not IP packet\n"); + } + +rx_exit: + if (master_dev) { + dev_put(master_dev); + } + + return ret; +} + +/* + * fast_classifier_find_dev_and_mac_addr() + * Find the device and MAC address for a given IPv4 address. + * + * Returns true if we find the device and MAC address, otherwise false. + * + * We look up the rtable entry for the address and, from its neighbour + * structure, obtain the hardware address. This means this function also + * works if the neighbours are routers too. + */ +static bool fast_classifier_find_dev_and_mac_addr(sfe_ip_addr_t *addr, struct net_device **dev, u8 *mac_addr, bool is_v4) +{ + struct neighbour *neigh; + struct rtable *rt; + struct rt6_info *rt6; + struct dst_entry *dst; + struct net_device *mac_dev; + + /* + * Look up the rtable entry for the IP address then get the hardware + * address from its neighbour structure. This means this works when the + * neighbours are routers too. 
+ */ + if (likely(is_v4)) { + rt = ip_route_output(&init_net, addr->ip, 0, 0, 0); + if (unlikely(IS_ERR(rt))) { + goto ret_fail; + } + + dst = (struct dst_entry *)rt; + } else { + rt6 = rt6_lookup(&init_net, (struct in6_addr *)addr->ip6, 0, 0, 0); + if (!rt6) { + goto ret_fail; + } + + dst = (struct dst_entry *)rt6; + } + + rcu_read_lock(); + neigh = sfe_dst_get_neighbour(dst, addr); + if (unlikely(!neigh)) { + rcu_read_unlock(); + dst_release(dst); + goto ret_fail; + } + + if (unlikely(!(neigh->nud_state & NUD_VALID))) { + rcu_read_unlock(); + neigh_release(neigh); + dst_release(dst); + goto ret_fail; + } + + mac_dev = neigh->dev; + if (!mac_dev) { + rcu_read_unlock(); + neigh_release(neigh); + dst_release(dst); + goto ret_fail; + } + + memcpy(mac_addr, neigh->ha, (size_t)mac_dev->addr_len); + + dev_hold(mac_dev); + *dev = mac_dev; + rcu_read_unlock(); + neigh_release(neigh); + dst_release(dst); + + return true; + +ret_fail: + if (is_v4) { + DEBUG_TRACE("failed to find MAC address for IP: %pI4\n", addr); + + } else { + DEBUG_TRACE("failed to find MAC address for IP: %pI6\n", addr); + } + + return false; +} + +static DEFINE_SPINLOCK(sfe_connections_lock); + +struct sfe_connection { + struct hlist_node hl; + struct sfe_connection_create *sic; + struct nf_conn *ct; + int hits; + int offload_permit; + int offloaded; + bool is_v4; + unsigned char smac[ETH_ALEN]; + unsigned char dmac[ETH_ALEN]; +}; + +static int sfe_connections_size; + +#define FC_CONN_HASH_ORDER 13 +static DEFINE_HASHTABLE(fc_conn_ht, FC_CONN_HASH_ORDER); + +static u32 fc_conn_hash(sfe_ip_addr_t *saddr, sfe_ip_addr_t *daddr, + unsigned short sport, unsigned short dport, bool is_v4) +{ + u32 idx, cnt = ((is_v4 ? sizeof(saddr->ip) : sizeof(saddr->ip6))/sizeof(u32)); + u32 hash = 0; + + for (idx = 0; idx < cnt; idx++) { + hash ^= ((u32 *)saddr)[idx] ^ ((u32 *)daddr)[idx]; + } + + return hash ^ (sport | (dport << 16)); +} + +/* + * fast_classifier_update_protocol() + * Update sfe_ipv4_create struct with new protocol information before we offload + */ +static int fast_classifier_update_protocol(struct sfe_connection_create *p_sic, struct nf_conn *ct) +{ + switch (p_sic->protocol) { + case IPPROTO_TCP: + p_sic->src_td_window_scale = ct->proto.tcp.seen[0].td_scale; + p_sic->src_td_max_window = ct->proto.tcp.seen[0].td_maxwin; + p_sic->src_td_end = ct->proto.tcp.seen[0].td_end; + p_sic->src_td_max_end = ct->proto.tcp.seen[0].td_maxend; + p_sic->dest_td_window_scale = ct->proto.tcp.seen[1].td_scale; + p_sic->dest_td_max_window = ct->proto.tcp.seen[1].td_maxwin; + p_sic->dest_td_end = ct->proto.tcp.seen[1].td_end; + p_sic->dest_td_max_end = ct->proto.tcp.seen[1].td_maxend; + + if (nf_ct_tcp_no_window_check + || (ct->proto.tcp.seen[0].flags & IP_CT_TCP_FLAG_BE_LIBERAL) + || (ct->proto.tcp.seen[1].flags & IP_CT_TCP_FLAG_BE_LIBERAL)) { + p_sic->flags |= SFE_CREATE_FLAG_NO_SEQ_CHECK; + } + + /* + * If the connection is shutting down do not manage it. + * state can not be SYN_SENT, SYN_RECV because connection is assured + * Not managed states: FIN_WAIT, CLOSE_WAIT, LAST_ACK, TIME_WAIT, CLOSE. 
+ */ + spin_lock(&ct->lock); + if (ct->proto.tcp.state != TCP_CONNTRACK_ESTABLISHED) { + spin_unlock(&ct->lock); + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_TCP_NOT_ESTABLISHED); + DEBUG_TRACE("connection in termination state: %#x, s: %pI4:%u, d: %pI4:%u\n", + ct->proto.tcp.state, &p_sic->src_ip, ntohs(p_sic->src_port), + &p_sic->dest_ip, ntohs(p_sic->dest_port)); + return 0; + } + spin_unlock(&ct->lock); + break; + + case IPPROTO_UDP: + break; + + default: + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_UNKNOW_PROTOCOL); + DEBUG_TRACE("unhandled protocol %d\n", p_sic->protocol); + return 0; + } + + return 1; +} + +/* fast_classifier_send_genl_msg() + * Function to send a generic netlink message + */ +static void fast_classifier_send_genl_msg(int msg, struct fast_classifier_tuple *fc_msg) +{ + struct sk_buff *skb; + int rc; + int buf_len; + int total_len; + void *msg_head; + + /* + * Calculate our packet payload size. + * Start with our family header. + */ + buf_len = fast_classifier_gnl_family.hdrsize; + + /* + * Add the nla_total_size of each attribute we're going to nla_put(). + */ + buf_len += nla_total_size(sizeof(*fc_msg)); + + /* + * Lastly we need to add space for the NL message header since + * genlmsg_new only accounts for the GENL header and not the + * outer NL header. To do this, we use a NL helper function which + * calculates the total size of a netlink message given a payload size. + * Note this value does not include the GENL header, but that's + * added automatically by genlmsg_new. + */ + total_len = nlmsg_total_size(buf_len); + skb = genlmsg_new(total_len, GFP_ATOMIC); + if (!skb) + return; + + msg_head = genlmsg_put(skb, 0, 0, &fast_classifier_gnl_family, 0, msg); + if (!msg_head) { + nlmsg_free(skb); + return; + } + + rc = nla_put(skb, FAST_CLASSIFIER_A_TUPLE, sizeof(struct fast_classifier_tuple), fc_msg); + if (rc != 0) { + genlmsg_cancel(skb, msg_head); + nlmsg_free(skb); + return; + } + +#if (LINUX_VERSION_CODE <= KERNEL_VERSION(3, 19 , 0)) + rc = genlmsg_end(skb, msg_head); + if (rc < 0) { + genlmsg_cancel(skb, msg_head); + nlmsg_free(skb); + return; + } +#else + genlmsg_end(skb, msg_head); + +#endif + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3, 13, 0)) + rc = genlmsg_multicast(&fast_classifier_gnl_family, skb, 0, 0, GFP_ATOMIC); +#else + rc = genlmsg_multicast(skb, 0, fast_classifier_genl_mcgrp[0].id, GFP_ATOMIC); +#endif + switch (msg) { + case FAST_CLASSIFIER_C_OFFLOADED: + if (rc == 0) { + atomic_inc(&offloaded_msgs); + } else { + atomic_inc(&offloaded_fail_msgs); + } + break; + case FAST_CLASSIFIER_C_DONE: + if (rc == 0) { + atomic_inc(&done_msgs); + } else { + atomic_inc(&done_fail_msgs); + } + break; + default: + DEBUG_ERROR("fast-classifer: Unknown message type sent!\n"); + break; + } + + DEBUG_TRACE("Notify NL message %d ", msg); + if (fc_msg->ethertype == AF_INET) { + DEBUG_TRACE("sip=%pI4 dip=%pI4 ", &fc_msg->src_saddr, &fc_msg->dst_saddr); + } else { + DEBUG_TRACE("sip=%pI6 dip=%pI6 ", &fc_msg->src_saddr, &fc_msg->dst_saddr); + } + DEBUG_TRACE("protocol=%d sport=%d dport=%d smac=%pM dmac=%pM\n", + fc_msg->proto, fc_msg->sport, fc_msg->dport, fc_msg->smac, fc_msg->dmac); +} + +/* + * fast_classifier_find_conn() + * find a connection object in the hash table + * @pre the sfe_connection_lock must be held before calling this function + */ +static struct sfe_connection * +fast_classifier_find_conn(sfe_ip_addr_t *saddr, sfe_ip_addr_t *daddr, + unsigned short sport, unsigned short dport, + unsigned char proto, bool is_v4) +{ + struct 
sfe_connection_create *p_sic; + struct sfe_connection *conn; + u32 key; +#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0)) + struct hlist_node *node; +#endif + + key = fc_conn_hash(saddr, daddr, sport, dport, is_v4); + + sfe_hash_for_each_possible(fc_conn_ht, conn, node, hl, key) { + if (conn->is_v4 != is_v4) { + continue; + } + + p_sic = conn->sic; + + if (p_sic->protocol == proto && + p_sic->src_port == sport && + p_sic->dest_port == dport && + sfe_addr_equal(&p_sic->src_ip, saddr, is_v4) && + sfe_addr_equal(&p_sic->dest_ip, daddr, is_v4)) { + return conn; + } + } + + DEBUG_TRACE("connection not found\n"); + return NULL; +} + +/* + * fast_classifier_sb_find_conn() + * find a connection object in the hash table according to information of packet + * if not found, reverse the tuple and try again. + * @pre the sfe_connection_lock must be held before calling this function + */ +static struct sfe_connection * +fast_classifier_sb_find_conn(sfe_ip_addr_t *saddr, sfe_ip_addr_t *daddr, + unsigned short sport, unsigned short dport, + unsigned char proto, bool is_v4) +{ + struct sfe_connection_create *p_sic; + struct sfe_connection *conn; + u32 key; +#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0)) + struct hlist_node *node; +#endif + + key = fc_conn_hash(saddr, daddr, sport, dport, is_v4); + + sfe_hash_for_each_possible(fc_conn_ht, conn, node, hl, key) { + if (conn->is_v4 != is_v4) { + continue; + } + + p_sic = conn->sic; + + if (p_sic->protocol == proto && + p_sic->src_port == sport && + p_sic->dest_port_xlate == dport && + sfe_addr_equal(&p_sic->src_ip, saddr, is_v4) && + sfe_addr_equal(&p_sic->dest_ip_xlate, daddr, is_v4)) { + return conn; + } + } + + /* + * Reverse the tuple and try again + */ + key = fc_conn_hash(daddr, saddr, dport, sport, is_v4); + + sfe_hash_for_each_possible(fc_conn_ht, conn, node, hl, key) { + if (conn->is_v4 != is_v4) { + continue; + } + + p_sic = conn->sic; + + if (p_sic->protocol == proto && + p_sic->src_port == dport && + p_sic->dest_port_xlate == sport && + sfe_addr_equal(&p_sic->src_ip, daddr, is_v4) && + sfe_addr_equal(&p_sic->dest_ip_xlate, saddr, is_v4)) { + return conn; + } + } + + DEBUG_TRACE("connection not found\n"); + return NULL; +} + +/* + * fast_classifier_add_conn() + * add a connection object in the hash table if no duplicate + * @conn connection to add + * @return conn if successful, NULL if duplicate + */ +static struct sfe_connection * +fast_classifier_add_conn(struct sfe_connection *conn) +{ + struct sfe_connection_create *sic = conn->sic; + u32 key; + + spin_lock_bh(&sfe_connections_lock); + if (fast_classifier_find_conn(&sic->src_ip, &sic->dest_ip, sic->src_port, + sic->dest_port, sic->protocol, conn->is_v4)) { + spin_unlock_bh(&sfe_connections_lock); + return NULL; + } + + key = fc_conn_hash(&sic->src_ip, &sic->dest_ip, + sic->src_port, sic->dest_port, conn->is_v4); + + hash_add(fc_conn_ht, &conn->hl, key); + sfe_connections_size++; + spin_unlock_bh(&sfe_connections_lock); + + DEBUG_TRACE(" -> adding item to sfe_connections, new size: %d\n", sfe_connections_size); + + if (conn->is_v4) { + DEBUG_TRACE("new offloadable: key: %u proto: %d src_ip: %pI4 dst_ip: %pI4, src_port: %d, dst_port: %d\n", + key, sic->protocol, &(sic->src_ip), &(sic->dest_ip), sic->src_port, sic->dest_port); + } else { + DEBUG_TRACE("new offloadable: key: %u proto: %d src_ip: %pI6 dst_ip: %pI6, src_port: %d, dst_port: %d\n", + key, sic->protocol, &(sic->src_ip), &(sic->dest_ip), sic->src_port, sic->dest_port); + } + + return conn; +} + +/* + * 
fast_classifier_offload_genl_msg() + * Called from user space to offload a connection + */ +static int +fast_classifier_offload_genl_msg(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *na; + struct fast_classifier_tuple *fc_msg; + struct sfe_connection *conn; + + na = info->attrs[FAST_CLASSIFIER_A_TUPLE]; + fc_msg = nla_data(na); + + if (fc_msg->ethertype == AF_INET) { + DEBUG_TRACE("want to offload: %d-%d, %pI4, %pI4, %d, %d SMAC=%pM DMAC=%pM\n", + fc_msg->ethertype, + fc_msg->proto, + &fc_msg->src_saddr, + &fc_msg->dst_saddr, + fc_msg->sport, + fc_msg->dport, + fc_msg->smac, + fc_msg->dmac); + } else { + DEBUG_TRACE("want to offload: %d-%d, %pI6, %pI6, %d, %d SMAC=%pM DMAC=%pM\n", + fc_msg->ethertype, + fc_msg->proto, + &fc_msg->src_saddr, + &fc_msg->dst_saddr, + fc_msg->sport, + fc_msg->dport, + fc_msg->smac, + fc_msg->dmac); + } + + spin_lock_bh(&sfe_connections_lock); + conn = fast_classifier_sb_find_conn((sfe_ip_addr_t *)&fc_msg->src_saddr, + (sfe_ip_addr_t *)&fc_msg->dst_saddr, + fc_msg->sport, + fc_msg->dport, + fc_msg->proto, + (fc_msg->ethertype == AF_INET)); + if (!conn) { + spin_unlock_bh(&sfe_connections_lock); + DEBUG_TRACE("REQUEST OFFLOAD NO MATCH\n"); + atomic_inc(&offload_no_match_msgs); + return 0; + } + + conn->offload_permit = 1; + spin_unlock_bh(&sfe_connections_lock); + atomic_inc(&offload_msgs); + + DEBUG_TRACE("INFO: calling sfe rule creation!\n"); + return 0; +} + +/* + * fast_classifier_nl_genl_msg_DUMP() + * ignore fast_classifier_messages OFFLOADED and DONE + */ +static int fast_classifier_nl_genl_msg_DUMP(struct sk_buff *skb, + struct netlink_callback *cb) +{ + return 0; +} + +/* auto offload connection once we have this many packets*/ +static int offload_at_pkts = 128; + +/* + * fast_classifier_post_routing() + * Called for packets about to leave the box - either locally generated or forwarded from another interface + */ +static unsigned int fast_classifier_post_routing(struct sk_buff *skb, bool is_v4) +{ + int ret; + struct sfe_connection_create sic; + struct sfe_connection_create *p_sic; + struct net_device *in; + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + struct net_device *dev; + struct net_device *src_dev; + struct net_device *dest_dev; + struct net_device *src_dev_tmp; + struct net_device *dest_dev_tmp; + struct net_device *src_br_dev = NULL; + struct net_device *dest_br_dev = NULL; + struct nf_conntrack_tuple orig_tuple; + struct nf_conntrack_tuple reply_tuple; + struct sfe_connection *conn; + + /* + * Don't process broadcast or multicast packets. + */ + if (unlikely(skb->pkt_type == PACKET_BROADCAST)) { + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_PACKET_BROADCAST); + DEBUG_TRACE("broadcast, ignoring\n"); + return NF_ACCEPT; + } + if (unlikely(skb->pkt_type == PACKET_MULTICAST)) { + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_PACKET_MULTICAST); + DEBUG_TRACE("multicast, ignoring\n"); + return NF_ACCEPT; + } + + /* + * Don't process packets that are not being forwarded. + */ + in = dev_get_by_index(&init_net, skb->skb_iif); + if (!in) { + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_NO_IIF); + DEBUG_TRACE("packet not forwarding\n"); + return NF_ACCEPT; + } + + dev_put(in); + + /* + * Don't process packets that aren't being tracked by conntrack. + */ + ct = nf_ct_get(skb, &ctinfo); + if (unlikely(!ct)) { + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_NO_CT); + DEBUG_TRACE("no conntrack connection, ignoring\n"); + return NF_ACCEPT; + } + + /* + * Don't process untracked connections. 
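+ * An untracked connection carries no conntrack state that the
+ * acceleration engine could synchronize against, so it is left alone.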
+ */ + if (unlikely(nf_ct_is_untracked(ct))) { + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_CT_NO_TRACK); + DEBUG_TRACE("untracked connection\n"); + return NF_ACCEPT; + } + + /* + * Unconfirmed connection may be dropped by Linux at the final step, + * So we don't process unconfirmed connections. + */ + if (!nf_ct_is_confirmed(ct)) { + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_CT_NO_CONFIRM); + DEBUG_TRACE("unconfirmed connection\n"); + return NF_ACCEPT; + } + + /* + * Don't process connections that require support from a 'helper' (typically a NAT ALG). + */ + if (unlikely(nfct_help(ct))) { + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_CT_IS_ALG); + DEBUG_TRACE("connection has helper\n"); + return NF_ACCEPT; + } + + memset(&sic, 0, sizeof(sic)); + + /* + * Look up the details of our connection in conntrack. + * + * Note that the data we get from conntrack is for the "ORIGINAL" direction + * but our packet may actually be in the "REPLY" direction. + */ + orig_tuple = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; + reply_tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple; + sic.protocol = (s32)orig_tuple.dst.protonum; + + sic.flags = 0; + + /* + * Get addressing information, non-NAT first + */ + if (likely(is_v4)) { + u32 dscp; + + sic.src_ip.ip = (__be32)orig_tuple.src.u3.ip; + sic.dest_ip.ip = (__be32)orig_tuple.dst.u3.ip; + + if (ipv4_is_multicast(sic.src_ip.ip) || ipv4_is_multicast(sic.dest_ip.ip)) { + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_IS_IPV4_MCAST); + DEBUG_TRACE("multicast address\n"); + return NF_ACCEPT; + } + + /* + * NAT'ed addresses - note these are as seen from the 'reply' direction + * When NAT does not apply to this connection these will be identical to the above. + */ + sic.src_ip_xlate.ip = (__be32)reply_tuple.dst.u3.ip; + sic.dest_ip_xlate.ip = (__be32)reply_tuple.src.u3.ip; + + dscp = ipv4_get_dsfield(ip_hdr(skb)) >> XT_DSCP_SHIFT; + if (dscp) { + sic.dest_dscp = dscp; + sic.src_dscp = sic.dest_dscp; + sic.flags |= SFE_CREATE_FLAG_REMARK_DSCP; + } + } else { + u32 dscp; + + sic.src_ip.ip6[0] = *((struct sfe_ipv6_addr *)&orig_tuple.src.u3.in6); + sic.dest_ip.ip6[0] = *((struct sfe_ipv6_addr *)&orig_tuple.dst.u3.in6); + + if (ipv6_addr_is_multicast((struct in6_addr *)sic.src_ip.ip6) || + ipv6_addr_is_multicast((struct in6_addr *)sic.dest_ip.ip6)) { + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_IS_IPV6_MCAST); + DEBUG_TRACE("multicast address\n"); + return NF_ACCEPT; + } + + /* + * NAT'ed addresses - note these are as seen from the 'reply' direction + * When NAT does not apply to this connection these will be identical to the above. + */ + sic.src_ip_xlate.ip6[0] = *((struct sfe_ipv6_addr *)&reply_tuple.dst.u3.in6); + sic.dest_ip_xlate.ip6[0] = *((struct sfe_ipv6_addr *)&reply_tuple.src.u3.in6); + + dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> XT_DSCP_SHIFT; + if (dscp) { + sic.dest_dscp = dscp; + sic.src_dscp = sic.dest_dscp; + sic.flags |= SFE_CREATE_FLAG_REMARK_DSCP; + } + } + + switch (sic.protocol) { + case IPPROTO_TCP: + sic.src_port = orig_tuple.src.u.tcp.port; + sic.dest_port = orig_tuple.dst.u.tcp.port; + sic.src_port_xlate = reply_tuple.dst.u.tcp.port; + sic.dest_port_xlate = reply_tuple.src.u.tcp.port; + + /* + * Don't try to manage a non-established connection. 
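+ * IPS_ASSURED is only set once conntrack has seen traffic in both
+ * directions, so it doubles as the "established" test here.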
+ */ + if (!test_bit(IPS_ASSURED_BIT, &ct->status)) { + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_TCP_NOT_ASSURED); + DEBUG_TRACE("non-established connection\n"); + return NF_ACCEPT; + } + + break; + + case IPPROTO_UDP: + sic.src_port = orig_tuple.src.u.udp.port; + sic.dest_port = orig_tuple.dst.u.udp.port; + sic.src_port_xlate = reply_tuple.dst.u.udp.port; + sic.dest_port_xlate = reply_tuple.src.u.udp.port; + break; + + default: + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_UNKNOW_PROTOCOL); + DEBUG_TRACE("unhandled protocol %d\n", sic.protocol); + return NF_ACCEPT; + } + +#ifdef CONFIG_XFRM + sic.original_accel = 1; + sic.reply_accel = 1; +#endif + + /* + * Get QoS information + */ + if (skb->priority) { + sic.dest_priority = skb->priority; + sic.src_priority = sic.dest_priority; + sic.flags |= SFE_CREATE_FLAG_REMARK_PRIORITY; + } + + if (is_v4) { + DEBUG_TRACE("POST_ROUTE: checking new connection: %d src_ip: %pI4 dst_ip: %pI4, src_port: %d, dst_port: %d\n", + sic.protocol, &sic.src_ip, &sic.dest_ip, sic.src_port, sic.dest_port); + } else { + DEBUG_TRACE("POST_ROUTE: checking new connection: %d src_ip: %pI6 dst_ip: %pI6, src_port: %d, dst_port: %d\n", + sic.protocol, &sic.src_ip, &sic.dest_ip, sic.src_port, sic.dest_port); + } + + /* + * If we already have this connection in our list, skip it + * XXX: this may need to be optimized + */ + spin_lock_bh(&sfe_connections_lock); + + conn = fast_classifier_find_conn(&sic.src_ip, &sic.dest_ip, sic.src_port, sic.dest_port, sic.protocol, is_v4); + if (conn) { + conn->hits++; + + if (!conn->offloaded) { + if (conn->offload_permit || conn->hits >= offload_at_pkts) { + DEBUG_TRACE("OFFLOADING CONNECTION, TOO MANY HITS\n"); + + if (fast_classifier_update_protocol(conn->sic, conn->ct) == 0) { + spin_unlock_bh(&sfe_connections_lock); + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_UPDATE_PROTOCOL_FAIL); + DEBUG_TRACE("UNKNOWN PROTOCOL OR CONNECTION CLOSING, SKIPPING\n"); + return NF_ACCEPT; + } + + DEBUG_TRACE("INFO: calling sfe rule creation!\n"); + spin_unlock_bh(&sfe_connections_lock); + + ret = is_v4 ? sfe_ipv4_create_rule(conn->sic) : sfe_ipv6_create_rule(conn->sic); + if ((ret == 0) || (ret == -EADDRINUSE)) { + struct fast_classifier_tuple fc_msg; + + if (is_v4) { + fc_msg.ethertype = AF_INET; + fc_msg.src_saddr.in = *((struct in_addr *)&sic.src_ip); + fc_msg.dst_saddr.in = *((struct in_addr *)&sic.dest_ip_xlate); + } else { + fc_msg.ethertype = AF_INET6; + fc_msg.src_saddr.in6 = *((struct in6_addr *)&sic.src_ip); + fc_msg.dst_saddr.in6 = *((struct in6_addr *)&sic.dest_ip_xlate); + } + + fc_msg.proto = sic.protocol; + fc_msg.sport = sic.src_port; + fc_msg.dport = sic.dest_port_xlate; + memcpy(fc_msg.smac, conn->smac, ETH_ALEN); + memcpy(fc_msg.dmac, conn->dmac, ETH_ALEN); + fast_classifier_send_genl_msg(FAST_CLASSIFIER_C_OFFLOADED, &fc_msg); + conn->offloaded = 1; + } + + return NF_ACCEPT; + } + } + + spin_unlock_bh(&sfe_connections_lock); + if (conn->offloaded) { + is_v4 ? sfe_ipv4_update_rule(conn->sic) : sfe_ipv6_update_rule(conn->sic); + } + + DEBUG_TRACE("FOUND, SKIPPING\n"); + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_WAIT_FOR_ACCELERATION); + return NF_ACCEPT; + } + + spin_unlock_bh(&sfe_connections_lock); + + /* + * Get the net device and MAC addresses that correspond to the various source and + * destination host addresses. 
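+ * Four lookups are performed (src, src_xlate, dest and dest_xlate) but
+ * only the src and dest_xlate devices are kept; the other two are
+ * released immediately since just their MAC addresses are needed.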
+ */ + if (!fast_classifier_find_dev_and_mac_addr(&sic.src_ip, &src_dev_tmp, sic.src_mac, is_v4)) { + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_NO_SRC_DEV); + return NF_ACCEPT; + } + src_dev = src_dev_tmp; + + if (!fast_classifier_find_dev_and_mac_addr(&sic.src_ip_xlate, &dev, sic.src_mac_xlate, is_v4)) { + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_NO_SRC_XLATE_DEV); + goto done1; + } + dev_put(dev); + + if (!fast_classifier_find_dev_and_mac_addr(&sic.dest_ip, &dev, sic.dest_mac, is_v4)) { + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_NO_DEST_DEV); + goto done1; + } + dev_put(dev); + + if (!fast_classifier_find_dev_and_mac_addr(&sic.dest_ip_xlate, &dest_dev_tmp, sic.dest_mac_xlate, is_v4)) { + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_NO_DEST_XLATE_DEV); + goto done1; + } + dest_dev = dest_dev_tmp; + + /* + * Our devices may actually be part of a bridge interface. If that's + * the case then find the bridge interface instead. + */ + if (src_dev->priv_flags & IFF_BRIDGE_PORT) { + src_br_dev = sfe_dev_get_master(src_dev); + if (!src_br_dev) { + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_NO_BRIDGE); + DEBUG_TRACE("no bridge found for: %s\n", src_dev->name); + goto done2; + } + src_dev = src_br_dev; + } + + if (dest_dev->priv_flags & IFF_BRIDGE_PORT) { + dest_br_dev = sfe_dev_get_master(dest_dev); + if (!dest_br_dev) { + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_NO_BRIDGE); + DEBUG_TRACE("no bridge found for: %s\n", dest_dev->name); + goto done3; + } + dest_dev = dest_br_dev; + } + + sic.src_dev = src_dev; + sic.dest_dev = dest_dev; + + sic.src_mtu = src_dev->mtu; + sic.dest_mtu = dest_dev->mtu; + + if (skb->mark) { + DEBUG_TRACE("SKB MARK NON ZERO %x\n", skb->mark); + } + sic.mark = skb->mark; + + conn = kmalloc(sizeof(*conn), GFP_ATOMIC); + if (!conn) { + printk(KERN_CRIT "ERROR: no memory for sfe\n"); + goto done4; + } + conn->hits = 0; + conn->offload_permit = 0; + conn->offloaded = 0; + conn->is_v4 = is_v4; + DEBUG_TRACE("Source MAC=%pM\n", sic.src_mac); + memcpy(conn->smac, sic.src_mac, ETH_ALEN); + memcpy(conn->dmac, sic.dest_mac_xlate, ETH_ALEN); + + p_sic = kmalloc(sizeof(*p_sic), GFP_ATOMIC); + if (!p_sic) { + printk(KERN_CRIT "ERROR: no memory for sfe\n"); + kfree(conn); + goto done4; + } + + memcpy(p_sic, &sic, sizeof(sic)); + conn->sic = p_sic; + conn->ct = ct; + + if (!fast_classifier_add_conn(conn)) { + kfree(conn->sic); + kfree(conn); + } + + /* + * If we had bridge ports then release them too. 
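+ * The done4..done1 labels below unwind the device references in the
+ * reverse order they were taken.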
+ */ +done4: + if (dest_br_dev) { + dev_put(dest_br_dev); + } +done3: + if (src_br_dev) { + dev_put(src_br_dev); + } +done2: + dev_put(dest_dev_tmp); +done1: + dev_put(src_dev_tmp); + + return NF_ACCEPT; +} + +/* + * fast_classifier_ipv4_post_routing_hook() + * Called for packets about to leave the box - either locally generated or forwarded from another interface + */ +fast_classifier_ipv4_post_routing_hook(hooknum, ops, skb, in_unused, out, okfn) +{ + return fast_classifier_post_routing(skb, true); +} + +/* + * fast_classifier_ipv6_post_routing_hook() + * Called for packets about to leave the box - either locally generated or forwarded from another interface + */ +fast_classifier_ipv6_post_routing_hook(hooknum, ops, skb, in_unused, out, okfn) +{ + return fast_classifier_post_routing(skb, false); +} + +/* + * fast_classifier_update_mark() + * updates the mark for a fast-classifier connection + */ +static void fast_classifier_update_mark(struct sfe_connection_mark *mark, bool is_v4) +{ + struct sfe_connection *conn; + + spin_lock_bh(&sfe_connections_lock); + + conn = fast_classifier_find_conn(&mark->src_ip, &mark->dest_ip, + mark->src_port, mark->dest_port, + mark->protocol, is_v4); + if (conn) { + conn->sic->mark = mark->mark; + } + + spin_unlock_bh(&sfe_connections_lock); +} + +#ifdef CONFIG_NF_CONNTRACK_EVENTS +/* + * fast_classifier_conntrack_event() + * Callback event invoked when a conntrack connection's state changes. + */ +#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS +static int fast_classifier_conntrack_event(struct notifier_block *this, + unsigned long events, void *ptr) +#else +static int fast_classifier_conntrack_event(unsigned int events, struct nf_ct_event *item) +#endif +{ +#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS + struct nf_ct_event *item = ptr; +#endif + struct sfe_connection_destroy sid; + struct nf_conn *ct = item->ct; + struct nf_conntrack_tuple orig_tuple; + struct sfe_connection *conn; + struct fast_classifier_tuple fc_msg; + int offloaded = 0; + bool is_v4; + + /* + * If we don't have a conntrack entry then we're done. + */ + if (unlikely(!ct)) { + DEBUG_WARN("no ct in conntrack event callback\n"); + return NOTIFY_DONE; + } + + /* + * If this is an untracked connection then we can't have any state either. + */ + if (unlikely(nf_ct_is_untracked(ct))) { + DEBUG_TRACE("ignoring untracked conn\n"); + return NOTIFY_DONE; + } + + orig_tuple = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; + sid.protocol = (s32)orig_tuple.dst.protonum; + + /* + * Extract information from the conntrack connection. We're only interested + * in nominal connection information (i.e. we're ignoring any NAT information). 
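+ * The l3num of the conntrack entry selects whether the IPv4 or the
+ * IPv6 mark/destroy path is used further down.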
+ */ + if (likely(nf_ct_l3num(ct) == AF_INET)) { + sid.src_ip.ip = (__be32)orig_tuple.src.u3.ip; + sid.dest_ip.ip = (__be32)orig_tuple.dst.u3.ip; + is_v4 = true; + } else if (likely(nf_ct_l3num(ct) == AF_INET6)) { + sid.src_ip.ip6[0] = *((struct sfe_ipv6_addr *)&orig_tuple.src.u3.in6); + sid.dest_ip.ip6[0] = *((struct sfe_ipv6_addr *)&orig_tuple.dst.u3.in6); + is_v4 = false; + } else { + DEBUG_TRACE("ignoring non-IPv4 and non-IPv6 connection\n"); + return NOTIFY_DONE; + } + + switch (sid.protocol) { + case IPPROTO_TCP: + sid.src_port = orig_tuple.src.u.tcp.port; + sid.dest_port = orig_tuple.dst.u.tcp.port; + break; + + case IPPROTO_UDP: + sid.src_port = orig_tuple.src.u.udp.port; + sid.dest_port = orig_tuple.dst.u.udp.port; + break; + + default: + DEBUG_TRACE("unhandled protocol: %d\n", sid.protocol); + return NOTIFY_DONE; + } + + /* + * Check for an updated mark + */ + if ((events & (1 << IPCT_MARK)) && (ct->mark != 0)) { + struct sfe_connection_mark mark; + + mark.protocol = sid.protocol; + mark.src_ip = sid.src_ip; + mark.dest_ip = sid.dest_ip; + mark.src_port = sid.src_port; + mark.dest_port = sid.dest_port; + mark.mark = ct->mark; + + is_v4 ? sfe_ipv4_mark_rule(&mark) : sfe_ipv6_mark_rule(&mark); + fast_classifier_update_mark(&mark, is_v4); + } + + /* + * We're only interested in destroy events at this point + */ + if (unlikely(!(events & (1 << IPCT_DESTROY)))) { + DEBUG_TRACE("ignoring non-destroy event\n"); + return NOTIFY_DONE; + } + + if (is_v4) { + DEBUG_TRACE("Try to clean up: proto: %d src_ip: %pI4 dst_ip: %pI4, src_port: %d, dst_port: %d\n", + sid.protocol, &sid.src_ip, &sid.dest_ip, sid.src_port, sid.dest_port); + } else { + DEBUG_TRACE("Try to clean up: proto: %d src_ip: %pI6 dst_ip: %pI6, src_port: %d, dst_port: %d\n", + sid.protocol, &sid.src_ip, &sid.dest_ip, sid.src_port, sid.dest_port); + } + + spin_lock_bh(&sfe_connections_lock); + + conn = fast_classifier_find_conn(&sid.src_ip, &sid.dest_ip, sid.src_port, sid.dest_port, sid.protocol, is_v4); + if (conn && conn->offloaded) { + if (is_v4) { + fc_msg.ethertype = AF_INET; + fc_msg.src_saddr.in = *((struct in_addr *)&conn->sic->src_ip); + fc_msg.dst_saddr.in = *((struct in_addr *)&conn->sic->dest_ip_xlate); + } else { + fc_msg.ethertype = AF_INET6; + fc_msg.src_saddr.in6 = *((struct in6_addr *)&conn->sic->src_ip); + fc_msg.dst_saddr.in6 = *((struct in6_addr *)&conn->sic->dest_ip_xlate); + } + + fc_msg.proto = conn->sic->protocol; + fc_msg.sport = conn->sic->src_port; + fc_msg.dport = conn->sic->dest_port_xlate; + memcpy(fc_msg.smac, conn->smac, ETH_ALEN); + memcpy(fc_msg.dmac, conn->dmac, ETH_ALEN); + offloaded = 1; + } + + if (conn) { + DEBUG_TRACE("Free connection\n"); + + hash_del(&conn->hl); + sfe_connections_size--; + kfree(conn->sic); + kfree(conn); + } else { + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_CT_DESTROY_MISS); + } + + spin_unlock_bh(&sfe_connections_lock); + + is_v4 ? 
sfe_ipv4_destroy_rule(&sid) : sfe_ipv6_destroy_rule(&sid); + + if (offloaded) { + fast_classifier_send_genl_msg(FAST_CLASSIFIER_C_DONE, &fc_msg); + } + + return NOTIFY_DONE; +} + +/* + * Netfilter conntrack event system to monitor connection tracking changes + */ +#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS +static struct notifier_block fast_classifier_conntrack_notifier = { + .notifier_call = fast_classifier_conntrack_event, +}; +#else +static struct nf_ct_event_notifier fast_classifier_conntrack_notifier = { + .fcn = fast_classifier_conntrack_event, +}; +#endif +#endif + +/* + * Structure to establish a hook into the post routing netfilter point - this + * will pick up local outbound and packets going from one interface to another. + * + * Note: see include/linux/netfilter_ipv4.h for info related to priority levels. + * We want to examine packets after NAT translation and any ALG processing. + */ +static struct nf_hook_ops fast_classifier_ops_post_routing[] __read_mostly = { + SFE_IPV4_NF_POST_ROUTING_HOOK(__fast_classifier_ipv4_post_routing_hook), + SFE_IPV6_NF_POST_ROUTING_HOOK(__fast_classifier_ipv6_post_routing_hook), +}; + +/* + * fast_classifier_sync_rule() + * Synchronize a connection's state. + */ +static void fast_classifier_sync_rule(struct sfe_connection_sync *sis) +{ + struct nf_conntrack_tuple_hash *h; + struct nf_conntrack_tuple tuple; + struct nf_conn *ct; + SFE_NF_CONN_ACCT(acct); + + /* + * Create a tuple so as to be able to look up a connection + */ + memset(&tuple, 0, sizeof(tuple)); + tuple.src.u.all = (__be16)sis->src_port; + tuple.dst.dir = IP_CT_DIR_ORIGINAL; + tuple.dst.protonum = (u8)sis->protocol; + tuple.dst.u.all = (__be16)sis->dest_port; + + if (sis->is_v6) { + tuple.src.u3.in6 = *((struct in6_addr *)sis->src_ip.ip6); + tuple.dst.u3.in6 = *((struct in6_addr *)sis->dest_ip.ip6); + tuple.src.l3num = AF_INET6; + + DEBUG_TRACE("update connection - p: %d, s: %pI6:%u, d: %pI6:%u\n", + (int)tuple.dst.protonum, + &tuple.src.u3.in6, (unsigned int)ntohs(tuple.src.u.all), + &tuple.dst.u3.in6, (unsigned int)ntohs(tuple.dst.u.all)); + } else { + tuple.src.u3.ip = sis->src_ip.ip; + tuple.dst.u3.ip = sis->dest_ip.ip; + tuple.src.l3num = AF_INET; + + DEBUG_TRACE("update connection - p: %d, s: %pI4:%u, d: %pI4:%u\n", + (int)tuple.dst.protonum, + &tuple.src.u3.ip, (unsigned int)ntohs(tuple.src.u.all), + &tuple.dst.u3.ip, (unsigned int)ntohs(tuple.dst.u.all)); + } + + /* + * Update packet count for ingress on bridge device + */ + if (skip_to_bridge_ingress) { + struct rtnl_link_stats64 nlstats; + nlstats.tx_packets = 0; + nlstats.tx_bytes = 0; + + if (sis->src_dev && IFF_EBRIDGE && + (sis->src_new_packet_count || sis->src_new_byte_count)) { + nlstats.rx_packets = sis->src_new_packet_count; + nlstats.rx_bytes = sis->src_new_byte_count; + spin_lock_bh(&sfe_connections_lock); + br_dev_update_stats(sis->src_dev, &nlstats); + spin_unlock_bh(&sfe_connections_lock); + } + if (sis->dest_dev && IFF_EBRIDGE && + (sis->dest_new_packet_count || sis->dest_new_byte_count)) { + nlstats.rx_packets = sis->dest_new_packet_count; + nlstats.rx_bytes = sis->dest_new_byte_count; + spin_lock_bh(&sfe_connections_lock); + br_dev_update_stats(sis->dest_dev, &nlstats); + spin_unlock_bh(&sfe_connections_lock); + } + } + + /* + * Look up conntrack connection + */ + h = nf_conntrack_find_get(&init_net, SFE_NF_CT_DEFAULT_ZONE, &tuple); + if (unlikely(!h)) { + DEBUG_TRACE("no connection found\n"); + return; + } + + ct = nf_ct_tuplehash_to_ctrack(h); + NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct); + + /* + 
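* The packets of an accelerated flow bypass conntrack entirely, so the elapsed time reported by the sync (sis->delta_jiffies) is folded into the conntrack timeout below rather than letting the entry expire early. +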
* Only update if this is not a fixed timeout + */ + if (!test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) { + spin_lock_bh(&ct->lock); + ct->timeout.expires += sis->delta_jiffies; + spin_unlock_bh(&ct->lock); + } + + acct = nf_conn_acct_find(ct); + if (acct) { + spin_lock_bh(&ct->lock); + atomic64_add(sis->src_new_packet_count, &SFE_ACCT_COUNTER(acct)[IP_CT_DIR_ORIGINAL].packets); + atomic64_add(sis->src_new_byte_count, &SFE_ACCT_COUNTER(acct)[IP_CT_DIR_ORIGINAL].bytes); + atomic64_add(sis->dest_new_packet_count, &SFE_ACCT_COUNTER(acct)[IP_CT_DIR_REPLY].packets); + atomic64_add(sis->dest_new_byte_count, &SFE_ACCT_COUNTER(acct)[IP_CT_DIR_REPLY].bytes); + spin_unlock_bh(&ct->lock); + } + + switch (sis->protocol) { + case IPPROTO_TCP: + spin_lock_bh(&ct->lock); + if (ct->proto.tcp.seen[0].td_maxwin < sis->src_td_max_window) { + ct->proto.tcp.seen[0].td_maxwin = sis->src_td_max_window; + } + if ((s32)(ct->proto.tcp.seen[0].td_end - sis->src_td_end) < 0) { + ct->proto.tcp.seen[0].td_end = sis->src_td_end; + } + if ((s32)(ct->proto.tcp.seen[0].td_maxend - sis->src_td_max_end) < 0) { + ct->proto.tcp.seen[0].td_maxend = sis->src_td_max_end; + } + if (ct->proto.tcp.seen[1].td_maxwin < sis->dest_td_max_window) { + ct->proto.tcp.seen[1].td_maxwin = sis->dest_td_max_window; + } + if ((s32)(ct->proto.tcp.seen[1].td_end - sis->dest_td_end) < 0) { + ct->proto.tcp.seen[1].td_end = sis->dest_td_end; + } + if ((s32)(ct->proto.tcp.seen[1].td_maxend - sis->dest_td_max_end) < 0) { + ct->proto.tcp.seen[1].td_maxend = sis->dest_td_max_end; + } + spin_unlock_bh(&ct->lock); + break; + } + + /* + * Release connection + */ + nf_ct_put(ct); +} + +/* + * fast_classifier_device_event() + */ +static int fast_classifier_device_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct net_device *dev = SFE_DEV_EVENT_PTR(ptr); + + if (dev && (event == NETDEV_DOWN)) { + sfe_ipv4_destroy_all_rules_for_dev(dev); + sfe_ipv6_destroy_all_rules_for_dev(dev); + } + + return NOTIFY_DONE; +} + +/* + * fast_classifier_inet_event() + */ +static int fast_classifier_inet_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct net_device *dev = ((struct in_ifaddr *)ptr)->ifa_dev->dev; + + if (dev && (event == NETDEV_DOWN)) { + sfe_ipv4_destroy_all_rules_for_dev(dev); + } + + return NOTIFY_DONE; +} + +/* + * fast_classifier_inet6_event() + */ +static int fast_classifier_inet6_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct net_device *dev = ((struct inet6_ifaddr *)ptr)->idev->dev; + + if (dev && (event == NETDEV_DOWN)) { + sfe_ipv6_destroy_all_rules_for_dev(dev); + } + + return NOTIFY_DONE; +} + +/* + * fast_classifier_get_offload_at_pkts() + */ +static ssize_t fast_classifier_get_offload_at_pkts(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + return snprintf(buf, (ssize_t)PAGE_SIZE, "%d\n", offload_at_pkts); +} + +/* + * fast_classifier_set_offload_at_pkts() + */ +static ssize_t fast_classifier_set_offload_at_pkts(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t size) +{ + long new; + int ret; + + ret = kstrtol(buf, 0, &new); + if (ret == -EINVAL || ((int)new != new)) + return -EINVAL; + + offload_at_pkts = new; + + return size; +} + +/* + * fast_classifier_get_debug_info() + */ +static ssize_t fast_classifier_get_debug_info(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + size_t len = 0; + struct sfe_connection *conn; + u32 i; +#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0)) + struct 
hlist_node *node; +#endif + + spin_lock_bh(&sfe_connections_lock); + len += scnprintf(buf, PAGE_SIZE - len, "size=%d offload=%d offload_no_match=%d" + " offloaded=%d done=%d offloaded_fail=%d done_fail=%d\n", + sfe_connections_size, + atomic_read(&offload_msgs), + atomic_read(&offload_no_match_msgs), + atomic_read(&offloaded_msgs), + atomic_read(&done_msgs), + atomic_read(&offloaded_fail_msgs), + atomic_read(&done_fail_msgs)); + sfe_hash_for_each(fc_conn_ht, i, node, conn, hl) { + len += scnprintf(buf + len, PAGE_SIZE - len, + (conn->is_v4 ? "o=%d, p=%d [%pM]:%pI4:%u %pI4:%u:[%pM] m=%08x h=%d\n" : "o=%d, p=%d [%pM]:%pI6:%u %pI6:%u:[%pM] m=%08x h=%d\n"), + conn->offloaded, + conn->sic->protocol, + conn->sic->src_mac, + &conn->sic->src_ip, + conn->sic->src_port, + &conn->sic->dest_ip, + conn->sic->dest_port, + conn->sic->dest_mac_xlate, + conn->sic->mark, + conn->hits); + } + spin_unlock_bh(&sfe_connections_lock); + + return len; +} + +/* + * fast_classifier_get_skip_bridge_ingress() + */ +static ssize_t fast_classifier_get_skip_bridge_ingress(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + return snprintf(buf, (ssize_t)PAGE_SIZE, "%d\n", skip_to_bridge_ingress); +} + +/* + * fast_classifier_set_skip_bridge_ingress() + */ +static ssize_t fast_classifier_set_skip_bridge_ingress(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t size) +{ + long new; + int ret; + + ret = kstrtol(buf, 0, &new); + if (ret == -EINVAL || ((int)new != new)) + return -EINVAL; + + skip_to_bridge_ingress = new ? 1 : 0; + + return size; +} + +/* + * fast_classifier_get_exceptions + * dump exception counters + */ +static ssize_t fast_classifier_get_exceptions(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + int idx, len; + struct fast_classifier *sc = &__sc; + + spin_lock_bh(&sc->lock); + for (len = 0, idx = 0; idx < FAST_CL_EXCEPTION_MAX; idx++) { + if (sc->exceptions[idx]) { + len += snprintf(buf + len, (ssize_t)(PAGE_SIZE - len), "%s = %d\n", fast_classifier_exception_events_string[idx], sc->exceptions[idx]); + } + } + spin_unlock_bh(&sc->lock); + + return len; +} + +/* + * sysfs attributes. 
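+ *
+ * Four files are exposed under /sys/fast_classifier/ (the kobject is
+ * created with a NULL parent, i.e. at the top of sysfs):
+ * offload_at_pkts (rw), debug_info (ro), skip_to_bridge_ingress (rw)
+ * and exceptions (ro). For example, to accelerate flows after 64
+ * packets instead of the default 128:
+ *
+ *   echo 64 > /sys/fast_classifier/offload_at_pkts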
+ */ +static const struct device_attribute fast_classifier_offload_at_pkts_attr = + __ATTR(offload_at_pkts, S_IWUSR | S_IRUGO, fast_classifier_get_offload_at_pkts, fast_classifier_set_offload_at_pkts); +static const struct device_attribute fast_classifier_debug_info_attr = + __ATTR(debug_info, S_IRUGO, fast_classifier_get_debug_info, NULL); +static const struct device_attribute fast_classifier_skip_bridge_ingress = + __ATTR(skip_to_bridge_ingress, S_IWUSR | S_IRUGO, fast_classifier_get_skip_bridge_ingress, fast_classifier_set_skip_bridge_ingress); +static const struct device_attribute fast_classifier_exceptions_attr = + __ATTR(exceptions, S_IRUGO, fast_classifier_get_exceptions, NULL); + +/* + * fast_classifier_init() + */ +static int __init fast_classifier_init(void) +{ + struct fast_classifier *sc = &__sc; + int result = -1; + + printk(KERN_ALERT "fast-classifier: starting up\n"); + DEBUG_INFO("SFE CM init\n"); + + hash_init(fc_conn_ht); + + /* + * Create sys/fast_classifier + */ + sc->sys_fast_classifier = kobject_create_and_add("fast_classifier", NULL); + if (!sc->sys_fast_classifier) { + DEBUG_ERROR("failed to register fast_classifier\n"); + goto exit1; + } + + result = sysfs_create_file(sc->sys_fast_classifier, &fast_classifier_offload_at_pkts_attr.attr); + if (result) { + DEBUG_ERROR("failed to register offload at pkgs: %d\n", result); + goto exit2; + } + + result = sysfs_create_file(sc->sys_fast_classifier, &fast_classifier_debug_info_attr.attr); + if (result) { + DEBUG_ERROR("failed to register debug dev: %d\n", result); + sysfs_remove_file(sc->sys_fast_classifier, &fast_classifier_offload_at_pkts_attr.attr); + goto exit2; + } + + result = sysfs_create_file(sc->sys_fast_classifier, &fast_classifier_skip_bridge_ingress.attr); + if (result) { + DEBUG_ERROR("failed to register skip bridge on ingress: %d\n", result); + sysfs_remove_file(sc->sys_fast_classifier, &fast_classifier_offload_at_pkts_attr.attr); + sysfs_remove_file(sc->sys_fast_classifier, &fast_classifier_debug_info_attr.attr); + goto exit2; + } + + result = sysfs_create_file(sc->sys_fast_classifier, &fast_classifier_exceptions_attr.attr); + if (result) { + DEBUG_ERROR("failed to register exceptions file: %d\n", result); + sysfs_remove_file(sc->sys_fast_classifier, &fast_classifier_offload_at_pkts_attr.attr); + sysfs_remove_file(sc->sys_fast_classifier, &fast_classifier_debug_info_attr.attr); + sysfs_remove_file(sc->sys_fast_classifier, &fast_classifier_skip_bridge_ingress.attr); + goto exit2; + } + + sc->dev_notifier.notifier_call = fast_classifier_device_event; + sc->dev_notifier.priority = 1; + register_netdevice_notifier(&sc->dev_notifier); + + sc->inet_notifier.notifier_call = fast_classifier_inet_event; + sc->inet_notifier.priority = 1; + register_inetaddr_notifier(&sc->inet_notifier); + + sc->inet6_notifier.notifier_call = fast_classifier_inet6_event; + sc->inet6_notifier.priority = 1; + register_inet6addr_notifier(&sc->inet6_notifier); + + /* + * Register our netfilter hooks. + */ + result = nf_register_hooks(fast_classifier_ops_post_routing, ARRAY_SIZE(fast_classifier_ops_post_routing)); + if (result < 0) { + DEBUG_ERROR("can't register nf post routing hook: %d\n", result); + goto exit3; + } + +#ifdef CONFIG_NF_CONNTRACK_EVENTS + /* + * Register a notifier hook to get fast notifications of expired connections. 
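+ * The same callback also handles IPCT_MARK events, so conntrack mark
+ * updates are propagated to accelerated flows in addition to
+ * connection destroys.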
+ */ + result = nf_conntrack_register_notifier(&init_net, &fast_classifier_conntrack_notifier); + if (result < 0) { + DEBUG_ERROR("can't register nf notifier hook: %d\n", result); + goto exit4; + } +#endif + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3, 13, 0)) + result = genl_register_family_with_ops_groups(&fast_classifier_gnl_family, + fast_classifier_gnl_ops, + fast_classifier_genl_mcgrp); + if (result) { + DEBUG_ERROR("failed to register genl ops: %d\n", result); + goto exit5; + } +#else + result = genl_register_family(&fast_classifier_gnl_family); + if (result) { + printk(KERN_CRIT "unable to register genl family\n"); + goto exit5; + } + + result = genl_register_ops(&fast_classifier_gnl_family, fast_classifier_gnl_ops); + if (result) { + printk(KERN_CRIT "unable to register ops\n"); + goto exit6; + } + + result = genl_register_mc_group(&fast_classifier_gnl_family, + fast_classifier_genl_mcgrp); + if (result) { + printk(KERN_CRIT "unable to register multicast group\n"); + goto exit6; + } +#endif + + printk(KERN_ALERT "fast-classifier: registered\n"); + + spin_lock_init(&sc->lock); + + /* + * Hook the receive path in the network stack. + */ + BUG_ON(athrs_fast_nat_recv); + RCU_INIT_POINTER(athrs_fast_nat_recv, fast_classifier_recv); + + /* + * Hook the shortcut sync callback. + */ + sfe_ipv4_register_sync_rule_callback(fast_classifier_sync_rule); + sfe_ipv6_register_sync_rule_callback(fast_classifier_sync_rule); + return 0; + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 13, 0)) +exit6: + genl_unregister_family(&fast_classifier_gnl_family); +#endif + +exit5: +#ifdef CONFIG_NF_CONNTRACK_EVENTS + nf_conntrack_unregister_notifier(&init_net, &fast_classifier_conntrack_notifier); + +exit4: +#endif + nf_unregister_hooks(fast_classifier_ops_post_routing, ARRAY_SIZE(fast_classifier_ops_post_routing)); + +exit3: + unregister_inetaddr_notifier(&sc->inet_notifier); + unregister_inet6addr_notifier(&sc->inet6_notifier); + unregister_netdevice_notifier(&sc->dev_notifier); + sysfs_remove_file(sc->sys_fast_classifier, &fast_classifier_offload_at_pkts_attr.attr); + sysfs_remove_file(sc->sys_fast_classifier, &fast_classifier_debug_info_attr.attr); + sysfs_remove_file(sc->sys_fast_classifier, &fast_classifier_skip_bridge_ingress.attr); + sysfs_remove_file(sc->sys_fast_classifier, &fast_classifier_exceptions_attr.attr); + +exit2: + kobject_put(sc->sys_fast_classifier); + +exit1: + return result; +} + +/* + * fast_classifier_exit() + */ +static void __exit fast_classifier_exit(void) +{ + struct fast_classifier *sc = &__sc; + int result = -1; + + DEBUG_INFO("SFE CM exit\n"); + printk(KERN_ALERT "fast-classifier: shutting down\n"); + + /* + * Unregister our sync callback. + */ + sfe_ipv4_register_sync_rule_callback(NULL); + sfe_ipv6_register_sync_rule_callback(NULL); + + /* + * Unregister our receive callback. + */ + RCU_INIT_POINTER(athrs_fast_nat_recv, NULL); + + /* + * Wait for all callbacks to complete. + */ + rcu_barrier(); + + /* + * Destroy all connections. 
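+ * Passing a NULL device is intended to flush every rule, not just the
+ * rules bound to a particular interface.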
+ */ + sfe_ipv4_destroy_all_rules_for_dev(NULL); + sfe_ipv6_destroy_all_rules_for_dev(NULL); + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 13, 0)) + result = genl_unregister_ops(&fast_classifier_gnl_family, fast_classifier_gnl_ops); + if (result != 0) { + printk(KERN_CRIT "Unable to unreigster genl_ops\n"); + } +#endif + + result = genl_unregister_family(&fast_classifier_gnl_family); + if (result != 0) { + printk(KERN_CRIT "Unable to unreigster genl_family\n"); + } + +#ifdef CONFIG_NF_CONNTRACK_EVENTS + nf_conntrack_unregister_notifier(&init_net, &fast_classifier_conntrack_notifier); + +#endif + nf_unregister_hooks(fast_classifier_ops_post_routing, ARRAY_SIZE(fast_classifier_ops_post_routing)); + + unregister_inet6addr_notifier(&sc->inet6_notifier); + unregister_inetaddr_notifier(&sc->inet_notifier); + unregister_netdevice_notifier(&sc->dev_notifier); + + kobject_put(sc->sys_fast_classifier); +} + +module_init(fast_classifier_init) +module_exit(fast_classifier_exit) + +MODULE_DESCRIPTION("Shortcut Forwarding Engine - Connection Manager"); +MODULE_LICENSE("Dual BSD/GPL"); + diff --git a/shortcut-fe/fast-classifier/fast-classifier.h b/shortcut-fe/fast-classifier/fast-classifier.h new file mode 100644 index 000000000..6b7a18cf6 --- /dev/null +++ b/shortcut-fe/fast-classifier/fast-classifier.h @@ -0,0 +1,57 @@ +/* + * User space header to send message to the fast classifier + * + * Copyright (c) 2013,2016 The Linux Foundation. All rights reserved. + * Permission to use, copy, modify, and/or distribute this software for + * any purpose with or without fee is hereby granted, provided that the + * above copyright notice and this permission notice appear in all copies. + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT + * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include + +#define FAST_CLASSIFIER_GENL_VERSION (1) +#define FAST_CLASSIFIER_GENL_NAME "FC" +#define FAST_CLASSIFIER_GENL_MCGRP "FC_MCGRP" +#define FAST_CLASSIFIER_GENL_HDRSIZE (0) + +enum { + FAST_CLASSIFIER_A_UNSPEC, + FAST_CLASSIFIER_A_TUPLE, + __FAST_CLASSIFIER_A_MAX, +}; + +#define FAST_CLASSIFIER_A_MAX (__FAST_CLASSIFIER_A_MAX - 1) + +enum { + FAST_CLASSIFIER_C_UNSPEC, + FAST_CLASSIFIER_C_OFFLOAD, + FAST_CLASSIFIER_C_OFFLOADED, + FAST_CLASSIFIER_C_DONE, + __FAST_CLASSIFIER_C_MAX, +}; + +#define FAST_CLASSIFIER_C_MAX (__FAST_CLASSIFIER_C_MAX - 1) + +struct fast_classifier_tuple { + unsigned short ethertype; + unsigned char proto; + union { + struct in_addr in; + struct in6_addr in6; + } src_saddr; + union { + struct in_addr in; + struct in6_addr in6; + } dst_saddr; + unsigned short sport; + unsigned short dport; + unsigned char smac[ETH_ALEN]; + unsigned char dmac[ETH_ALEN]; +}; diff --git a/shortcut-fe/fast-classifier/nl_classifier_test.c b/shortcut-fe/fast-classifier/nl_classifier_test.c new file mode 100644 index 000000000..639417964 --- /dev/null +++ b/shortcut-fe/fast-classifier/nl_classifier_test.c @@ -0,0 +1,281 @@ +/* + * Copyright (c) 2016 The Linux Foundation. All rights reserved. 
+ * Permission to use, copy, modify, and/or distribute this software for + * any purpose with or without fee is hereby granted, provided that the + * above copyright notice and this permission notice appear in all copies. + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT + * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include + +#define NL_CLASSIFIER_GENL_VERSION 1 +#define NL_CLASSIFIER_GENL_FAMILY "FC" +#define NL_CLASSIFIER_GENL_GROUP "FC_MCGRP" +#define NL_CLASSIFIER_GENL_HDRSIZE 0 + +enum NL_CLASSIFIER_CMD { + NL_CLASSIFIER_CMD_UNSPEC, + NL_CLASSIFIER_CMD_ACCEL, + NL_CLASSIFIER_CMD_ACCEL_OK, + NL_CLASSIFIER_CMD_CONNECTION_CLOSED, + NL_CLASSIFIER_CMD_MAX, +}; + +enum NL_CLASSIFIER_ATTR { + NL_CLASSIFIER_ATTR_UNSPEC, + NL_CLASSIFIER_ATTR_TUPLE, + NL_CLASSIFIER_ATTR_MAX, +}; + +union nl_classifier_tuple_ip { + struct in_addr in; + struct in6_addr in6; +}; + +struct nl_classifier_tuple { + unsigned short af; + unsigned char proto; + union nl_classifier_tuple_ip src_ip; + union nl_classifier_tuple_ip dst_ip; + unsigned short sport; + unsigned short dport; + unsigned char smac[6]; + unsigned char dmac[6]; +}; + +struct nl_classifier_instance { + struct nl_sock *sock; + int family_id; + int group_id; + int stop; +}; + +struct nl_classifier_instance nl_cls_inst; + +static struct nla_policy nl_classifier_genl_policy[(NL_CLASSIFIER_ATTR_MAX+1)] = { + [NL_CLASSIFIER_ATTR_TUPLE] = { .type = NLA_UNSPEC }, +}; + +void nl_classifier_dump_nl_tuple(struct nl_classifier_tuple *tuple) +{ + char ip_str[64]; + + printf("protocol = %s\n", (tuple->proto == IPPROTO_UDP) ? "udp" : ((tuple->proto == IPPROTO_TCP) ? 
"tcp" : "unknown")); + printf("source ip = %s\n", inet_ntop(tuple->af, &tuple->src_ip, ip_str, sizeof(ip_str))); + printf("destination ip = %s\n", inet_ntop(tuple->af, &tuple->dst_ip, ip_str, sizeof(ip_str))); + printf("source port = %d\n", ntohs(tuple->sport)); + printf("destination port = %d\n", ntohs(tuple->dport)); +} + +int nl_classifier_msg_recv(struct nl_msg *msg, void *arg) +{ + struct nlmsghdr *nlh = nlmsg_hdr(msg); + struct genlmsghdr *gnlh = nlmsg_data(nlh); + struct nlattr *attrs[(NL_CLASSIFIER_ATTR_MAX+1)]; + + genlmsg_parse(nlh, NL_CLASSIFIER_GENL_HDRSIZE, attrs, NL_CLASSIFIER_ATTR_MAX, nl_classifier_genl_policy); + + switch (gnlh->cmd) { + case NL_CLASSIFIER_CMD_ACCEL_OK: + printf("Acceleration successful:\n"); + nl_classifier_dump_nl_tuple(nla_data(attrs[NL_CLASSIFIER_ATTR_TUPLE])); + return NL_OK; + case NL_CLASSIFIER_CMD_CONNECTION_CLOSED: + printf("Connection is closed:\n"); + nl_classifier_dump_nl_tuple(nla_data(attrs[NL_CLASSIFIER_ATTR_TUPLE])); + return NL_OK; + default: + printf("nl classifier received unknow message %d\n", gnlh->cmd); + } + + return NL_SKIP; +} + +void nl_classifier_offload(struct nl_classifier_instance *inst, + unsigned char proto, unsigned long *src_saddr, + unsigned long *dst_saddr, unsigned short sport, + unsigned short dport, int af) +{ + struct nl_msg *msg; + int ret; + struct nl_classifier_tuple classifier_msg; + + memset(&classifier_msg, 0, sizeof(classifier_msg)); + classifier_msg.af = af; + classifier_msg.proto = proto; + memcpy(&classifier_msg.src_ip, src_saddr, (af == AF_INET ? 4 : 16)); + memcpy(&classifier_msg.dst_ip, dst_saddr, (af == AF_INET ? 4 : 16)); + classifier_msg.sport = sport; + classifier_msg.dport = dport; + + msg = nlmsg_alloc(); + if (!msg) { + printf("Unable to allocate message\n"); + return; + } + + genlmsg_put(msg, NL_AUTO_PID, NL_AUTO_SEQ, inst->family_id, + NL_CLASSIFIER_GENL_HDRSIZE, NLM_F_REQUEST, + NL_CLASSIFIER_CMD_ACCEL, NL_CLASSIFIER_GENL_VERSION); + nla_put(msg, NL_CLASSIFIER_ATTR_TUPLE, sizeof(classifier_msg), &classifier_msg); + + ret = nl_send_auto(inst->sock, msg); + if (ret < 0) { + printf("send netlink message failed.\n"); + nlmsg_free(msg); + return; + } + + nlmsg_free(msg); + printf("nl classifier offload connection successful\n"); +} + +int nl_classifier_init(struct nl_classifier_instance *inst) +{ + int ret; + + inst->sock = nl_socket_alloc(); + if (!inst->sock) { + printf("Unable to allocation socket.\n"); + return -1; + } + genl_connect(inst->sock); + + inst->family_id = genl_ctrl_resolve(inst->sock, NL_CLASSIFIER_GENL_FAMILY); + if (inst->family_id < 0) { + printf("Unable to resolve family %s\n", NL_CLASSIFIER_GENL_FAMILY); + goto init_failed; + } + + inst->group_id = genl_ctrl_resolve_grp(inst->sock, NL_CLASSIFIER_GENL_FAMILY, NL_CLASSIFIER_GENL_GROUP); + if (inst->group_id < 0) { + printf("Unable to resolve mcast group %s\n", NL_CLASSIFIER_GENL_GROUP); + goto init_failed; + } + + ret = nl_socket_add_membership(inst->sock, inst->group_id); + if (ret < 0) { + printf("Unable to add membership\n"); + goto init_failed; + } + + nl_socket_disable_seq_check(inst->sock); + nl_socket_modify_cb(inst->sock, NL_CB_VALID, NL_CB_CUSTOM, nl_classifier_msg_recv, NULL); + + printf("nl classifier init successful\n"); + return 0; + +init_failed: + if (inst->sock) { + nl_close(inst->sock); + nl_socket_free(inst->sock); + inst->sock = NULL; + } + return -1; +} + +void nl_classifier_exit(struct nl_classifier_instance *inst) +{ + if (inst->sock) { + nl_close(inst->sock); + nl_socket_free(inst->sock); + inst->sock = 
NULL; + } + printf("nl classifier exit successful\n"); +} + +int nl_classifier_parse_arg(int argc, char *argv[], unsigned char *proto, unsigned long *src_saddr, + unsigned long *dst_saddr, unsigned short *sport, unsigned short *dport, int *af) +{ + int ret; + unsigned short port; + + if (argc < 7) { + printf("help: nl_classifier \n"); + return -1; + } + + if (0 == strncmp(argv[1], "v4", 2)) { + *af = AF_INET; + } else if (0 == strncmp(argv[1], "v6", 2)) { + *af = AF_INET6; + } else { + printf("Address family is not supported"); + return -1; + } + + if (0 == strncmp(argv[2], "udp", 3)) { + *proto = IPPROTO_UDP; + } else if (0 == strncmp(argv[2], "tcp", 3)) { + *proto = IPPROTO_TCP; + } else { + printf("Protocol is not supported"); + return -1; + } + + ret = inet_pton(*af, argv[3], src_saddr); + if (ret <= 0) { + printf("source ip has wrong format\n"); + return -1; + } + + ret = inet_pton(*af, argv[4], dst_saddr); + if (ret <= 0) { + printf("destination ip has wrong format\n"); + return -1; + } + + port = strtol(argv[5], NULL, 0); + *sport = htons(port); + port = strtol(argv[6], NULL, 0); + *dport = htons(port); + + printf("nl classifier parse arguments successful\n"); + return 0; +} + +int main(int argc, char *argv[]) +{ + struct nl_classifier_instance *inst = &nl_cls_inst; + unsigned char proto; + unsigned long src_addr[4]; + unsigned long dst_addr[4]; + unsigned short sport; + unsigned short dport; + int af; + int ret; + + ret = nl_classifier_parse_arg(argc, argv, &proto, src_addr, dst_addr, &sport, &dport, &af); + if (ret < 0) { + printf("Failed to parse arguments\n"); + return ret; + } + + ret = nl_classifier_init(inst); + if (ret < 0) { + printf("Unable to init generic netlink\n"); + return ret; + } + + nl_classifier_offload(inst, proto, src_addr, dst_addr, sport, dport, af); + + /* main loop to listen on message */ + while (!inst->stop) { + nl_recvmsgs_default(inst->sock); + } + + nl_classifier_exit(inst); + + return 0; +} diff --git a/shortcut-fe/fast-classifier/userspace_example.c b/shortcut-fe/fast-classifier/userspace_example.c new file mode 100644 index 000000000..4f4113d99 --- /dev/null +++ b/shortcut-fe/fast-classifier/userspace_example.c @@ -0,0 +1,232 @@ +/* + * Copyright (c) 2013,2016 The Linux Foundation. All rights reserved. + * Permission to use, copy, modify, and/or distribute this software for + * any purpose with or without fee is hereby granted, provided that the + * above copyright notice and this permission notice appear in all copies. + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT + * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include + +#include + +static struct nl_sock *sock; +static struct nl_sock *sock_event; +static int family; +static int grp_id; + +static struct nla_policy fast_classifier_genl_policy[FAST_CLASSIFIER_A_MAX + 1] = { + [FAST_CLASSIFIER_A_TUPLE] = { .type = NLA_UNSPEC }, +}; + +void dump_fc_tuple(struct fast_classifier_tuple *fc_msg) +{ + char src_str[INET_ADDRSTRLEN]; + char dst_str[INET_ADDRSTRLEN]; + + printf("TUPLE: %d, %s, %s, %d, %d" + " SMAC=%02x:%02x:%02x:%02x:%02x:%02x", + " DMAC=%02x:%02x:%02x:%02x:%02x:%02x\n", + fc_msg->proto, + inet_ntop(AF_INET, + &fc_msg->src_saddr.in.s_addr, + src_str, + INET_ADDRSTRLEN), + inet_ntop(AF_INET, + &fc_msg->dst_saddr.in.s_addr, + dst_str, + INET_ADDRSTRLEN), + fc_msg->sport, fc_msg->dport, + fc_msg->smac[0], fc_msg->smac[1], fc_msg->smac[2], + fc_msg->smac[3], fc_msg->smac[4], fc_msg->smac[5], + fc_msg->dmac[0], fc_msg->dmac[1], fc_msg->dmac[2], + fc_msg->dmac[3], fc_msg->dmac[4], fc_msg->dmac[5]); +} + +static int parse_cb(struct nl_msg *msg, void *arg) +{ + struct nlmsghdr *nlh = nlmsg_hdr(msg); + struct genlmsghdr *gnlh = nlmsg_data(nlh); + struct nlattr *attrs[FAST_CLASSIFIER_A_MAX]; + + genlmsg_parse(nlh, 0, attrs, FAST_CLASSIFIER_A_MAX, fast_classifier_genl_policy); + + switch (gnlh->cmd) { + case FAST_CLASSIFIER_C_OFFLOADED: + printf("Got a offloaded message\n"); + dump_fc_tuple(nla_data(attrs[FAST_CLASSIFIER_A_TUPLE])); + return NL_OK; + case FAST_CLASSIFIER_C_DONE: + printf("Got a done message\n"); + dump_fc_tuple(nla_data(attrs[FAST_CLASSIFIER_A_TUPLE])); + return NL_OK; + } + + return NL_SKIP; +} + +int fast_classifier_init(void) +{ + int err; + + sock = nl_socket_alloc(); + if (!sock) { + printf("Unable to allocation socket.\n"); + return -1; + } + genl_connect(sock); + + sock_event = nl_socket_alloc(); + if (!sock_event) { + nl_close(sock); + nl_socket_free(sock); + printf("Unable to allocation socket.\n"); + return -1; + } + genl_connect(sock_event); + + family = genl_ctrl_resolve(sock, FAST_CLASSIFIER_GENL_NAME); + if (family < 0) { + nl_close(sock_event); + nl_close(sock); + nl_socket_free(sock); + nl_socket_free(sock_event); + printf("Unable to resolve family\n"); + return -1; + } + + grp_id = genl_ctrl_resolve_grp(sock, FAST_CLASSIFIER_GENL_NAME, + FAST_CLASSIFIER_GENL_MCGRP); + if (grp_id < 0) { + printf("Unable to resolve mcast group\n"); + return -1; + } + + err = nl_socket_add_membership(sock_event, grp_id); + if (err < 0) { + printf("Unable to add membership\n"); + return -1; + } + + nl_socket_disable_seq_check(sock_event); + nl_socket_modify_cb(sock_event, NL_CB_VALID, NL_CB_CUSTOM, parse_cb, NULL); + + return 0; +} + +void fast_classifier_close(void) +{ + nl_close(sock_event); + nl_close(sock); + nl_socket_free(sock_event); + nl_socket_free(sock); +} + +void fast_classifier_ipv4_offload(unsigned char proto, unsigned long src_saddr, + unsigned long dst_saddr, unsigned short sport, + unsigned short dport) +{ + struct nl_msg *msg; + int ret; +#ifdef DEBUG + char src_str[INET_ADDRSTRLEN]; + char dst_str[INET_ADDRSTRLEN]; +#endif + struct fast_classifier_tuple fc_msg; + +#ifdef DEBUG + printf("DEBUG: would offload: %d, %s, %s, %d, %d\n", proto, + inet_ntop(AF_INET, &src_saddr, src_str, INET_ADDRSTRLEN), + inet_ntop(AF_INET, &dst_saddr, dst_str, INET_ADDRSTRLEN), + sport, dport); +#endif + + fc_msg.proto = proto; + fc_msg.src_saddr.in.s_addr = src_saddr; + fc_msg.dst_saddr.in.s_addr = dst_saddr; + fc_msg.sport = sport; + fc_msg.dport = dport; + fc_msg.smac[0] = 'a'; + 
fc_msg.smac[1] = 'b'; + fc_msg.smac[2] = 'c'; + fc_msg.smac[3] = 'd'; + fc_msg.smac[4] = 'e'; + fc_msg.smac[5] = 'f'; + fc_msg.dmac[0] = 'f'; + fc_msg.dmac[1] = 'e'; + fc_msg.dmac[2] = 'd'; + fc_msg.dmac[3] = 'c'; + fc_msg.dmac[4] = 'b'; + fc_msg.dmac[5] = 'a'; + + if (fast_classifier_init() < 0) { + printf("Unable to init generic netlink\n"); + exit(1); + } + + msg = nlmsg_alloc(); + if (!msg) { + nl_socket_free(sock); + printf("Unable to allocate message\n"); + return; + } + + genlmsg_put(msg, NL_AUTO_PID, NL_AUTO_SEQ, family, + FAST_CLASSIFIER_GENL_HDRSIZE, NLM_F_REQUEST, + FAST_CLASSIFIER_C_OFFLOAD, FAST_CLASSIFIER_GENL_VERSION); + nla_put(msg, 1, sizeof(fc_msg), &fc_msg); + + ret = nl_send_auto_complete(sock, msg); + + nlmsg_free(msg); + if (ret < 0) { + printf("nlmsg_free failed"); + nl_close(sock); + nl_socket_free(sock); + return; + } + + ret = nl_wait_for_ack(sock); + if (ret < 0) { + printf("wait for ack failed"); + nl_close(sock); + nl_socket_free(sock); + return; + } +} + +void fast_classifier_listen_for_messages(void) +{ + printf("waiting for netlink events\n"); + + while (1) { + nl_recvmsgs_default(sock_event); + } +} + +int main(int argc, char *argv[]) +{ + if (fast_classifier_init() < 0) { + printf("Unable to init generic netlink\n"); + exit(1); + } + + fast_classifier_ipv4_offload('a', 0, 0, 0, 0); + + /* this never returns */ + fast_classifier_listen_for_messages(); + + fast_classifier_close(); + + return 0; +} diff --git a/shortcut-fe/patches/Kconfig.patch b/shortcut-fe/patches/Kconfig.patch new file mode 100644 index 000000000..7df0abe83 --- /dev/null +++ b/shortcut-fe/patches/Kconfig.patch @@ -0,0 +1,12 @@ +diff --git a/net/Kconfig b/net/Kconfig +index 976cb63..4a7b0af 100644 +--- a/net/Kconfig ++++ b/net/Kconfig +@@ -236,6 +236,7 @@ source "net/dcb/Kconfig" + source "net/dns_resolver/Kconfig" + source "net/batman-adv/Kconfig" + source "net/openvswitch/Kconfig" ++source "net/shortcut-fe/Kconfig" + + config RPS + boolean "RPS" diff --git a/shortcut-fe/patches/Makefile.patch b/shortcut-fe/patches/Makefile.patch new file mode 100644 index 000000000..1ccc2b0ec --- /dev/null +++ b/shortcut-fe/patches/Makefile.patch @@ -0,0 +1,10 @@ +diff --git a/net/Makefile b/net/Makefile +index 6865dab..a8f0091 100644 +--- a/net/Makefile ++++ b/net/Makefile +@@ -71,3 +71,5 @@ obj-$(CONFIG_BATMAN_ADV) += batman-adv/ + obj-$(CONFIG_NFC) += nfc/ + obj-$(CONFIG_OPENVSWITCH) += openvswitch/ + obj-$(CONFIG_NET_ACTIVITY_STATS) += activity_stats.o ++obj-$(CONFIG_SHORTCUT_FE) += shortcut-fe/ ++ diff --git a/shortcut-fe/patches/dev.c.patch b/shortcut-fe/patches/dev.c.patch new file mode 100644 index 000000000..55fc03a74 --- /dev/null +++ b/shortcut-fe/patches/dev.c.patch @@ -0,0 +1,43 @@ +diff --git a/net/core/dev.c b/net/core/dev.c +index d23742f..1f0415f 100644 +--- a/net/core/dev.c ++++ b/net/core/dev.c +@@ -3168,6 +3168,9 @@ void netdev_rx_handler_unregister(struct net_device *dev) + } + EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister); + ++int (*athrs_fast_nat_recv)(struct sk_buff *skb) __rcu __read_mostly; ++EXPORT_SYMBOL_GPL(athrs_fast_nat_recv); ++ + static int __netif_receive_skb(struct sk_buff *skb) + { + struct packet_type *ptype, *pt_prev; +@@ -3177,6 +3180,7 @@ static int __netif_receive_skb(struct sk_buff *skb) + bool deliver_exact = false; + int ret = NET_RX_DROP; + __be16 type; ++ int (*fast_recv)(struct sk_buff *skb); + + net_timestamp_check(!netdev_tstamp_prequeue, skb); + +@@ -3194,10 +3198,18 @@ static int __netif_receive_skb(struct sk_buff *skb) + 
skb_reset_transport_header(skb); + skb_reset_mac_len(skb); + +- pt_prev = NULL; +- + rcu_read_lock(); + ++ fast_recv = rcu_dereference(athrs_fast_nat_recv); ++ if (fast_recv) { ++ if (fast_recv(skb)) { ++ rcu_read_unlock(); ++ return NET_RX_SUCCESS; ++ } ++ } ++ ++ pt_prev = NULL; ++ + another_round: + + __this_cpu_inc(softnet_data.processed); diff --git a/shortcut-fe/patches/nf_conntrack_proto_tcp.c.patch b/shortcut-fe/patches/nf_conntrack_proto_tcp.c.patch new file mode 100644 index 000000000..2cd2313a2 --- /dev/null +++ b/shortcut-fe/patches/nf_conntrack_proto_tcp.c.patch @@ -0,0 +1,25 @@ +--- /home/dhudson/mips-orig/nf_conntrack_proto_tcp.c 2013-05-07 21:32:57.153896922 +0100 ++++ nf_conntrack_proto_tcp.c 2013-06-13 16:37:40.137102438 +0100 +@@ -27,18 +27,20 @@ + #include + #include + #include + #include + + /* Do not check the TCP window for incoming packets */ +-static int nf_ct_tcp_no_window_check __read_mostly = 1; ++int nf_ct_tcp_no_window_check __read_mostly = 0; ++EXPORT_SYMBOL_GPL(nf_ct_tcp_no_window_check); + + /* "Be conservative in what you do, + be liberal in what you accept from others." + If it's non-zero, we mark only out of window RST segments as INVALID. */ +-static int nf_ct_tcp_be_liberal __read_mostly = 0; ++int nf_ct_tcp_be_liberal __read_mostly = 0; ++EXPORT_SYMBOL_GPL(nf_ct_tcp_be_liberal); + + /* If it is set to zero, we disable picking up already established + connections. */ + static int nf_ct_tcp_loose __read_mostly = 1; + + /* Max number of the retransmitted packets without receiving an (acceptable) diff --git a/shortcut-fe/sfe.h b/shortcut-fe/sfe.h new file mode 100644 index 000000000..279e7b3dc --- /dev/null +++ b/shortcut-fe/sfe.h @@ -0,0 +1,114 @@ +/* + * sfe.h + * Shortcut forwarding engine. + * + * Copyright (c) 2013-2017 The Linux Foundation. All rights reserved. + * Permission to use, copy, modify, and/or distribute this software for + * any purpose with or without fee is hereby granted, provided that the + * above copyright notice and this permission notice appear in all copies. + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT + * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + + +/* + * The following are debug macros used throughout the SFE. + * + * The DEBUG_LEVEL enables the followings based on its value, + * when dynamic debug option is disabled. + * + * 0 = OFF + * 1 = ASSERTS / ERRORS + * 2 = 1 + WARN + * 3 = 2 + INFO + * 4 = 3 + TRACE + */ +#define DEBUG_LEVEL 2 + +#if (DEBUG_LEVEL < 1) +#define DEBUG_ASSERT(s, ...) +#define DEBUG_ERROR(s, ...) +#else +#define DEBUG_ASSERT(c, s, ...) if (!(c)) { pr_emerg("ASSERT: %s:%d:" s, __FUNCTION__, __LINE__, ##__VA_ARGS__); BUG(); } +#define DEBUG_ERROR(s, ...) pr_err("%s:%d:" s, __FUNCTION__, __LINE__, ##__VA_ARGS__) +#endif + +#if defined(CONFIG_DYNAMIC_DEBUG) +/* + * Compile messages for dynamic enable/disable + */ +#define DEBUG_WARN(s, ...) pr_debug("%s[%d]:" s, __FUNCTION__, __LINE__, ##__VA_ARGS__) +#define DEBUG_INFO(s, ...) pr_debug("%s[%d]:" s, __FUNCTION__, __LINE__, ##__VA_ARGS__) +#define DEBUG_TRACE(s, ...) 
pr_debug("%s[%d]:" s, __FUNCTION__, __LINE__, ##__VA_ARGS__) +#else + +/* + * Statically compile messages at different levels + */ +#if (DEBUG_LEVEL < 2) +#define DEBUG_WARN(s, ...) +#else +#define DEBUG_WARN(s, ...) pr_warn("%s[%d]:" s, __FUNCTION__, __LINE__, ##__VA_ARGS__) +#endif + +#if (DEBUG_LEVEL < 3) +#define DEBUG_INFO(s, ...) +#else +#define DEBUG_INFO(s, ...) pr_notice("%s[%d]:" s, __FUNCTION__, __LINE__, ##__VA_ARGS__) +#endif + +#if (DEBUG_LEVEL < 4) +#define DEBUG_TRACE(s, ...) +#else +#define DEBUG_TRACE(s, ...) pr_info("%s[%d]:" s, __FUNCTION__, __LINE__, ##__VA_ARGS__) +#endif +#endif + +#ifdef CONFIG_NF_FLOW_COOKIE +typedef int (*flow_cookie_set_func_t)(u32 protocol, __be32 src_ip, __be16 src_port, + __be32 dst_ip, __be16 dst_port, u16 flow_cookie); +/* + * sfe_register_flow_cookie_cb + * register a function in SFE to let SFE use this function to configure flow cookie for a flow + * + * Hardware driver which support flow cookie should register a callback function in SFE. Then SFE + * can use this function to configure flow cookie for a flow. + * return: 0, success; !=0, fail + */ +int sfe_register_flow_cookie_cb(flow_cookie_set_func_t cb); + +/* + * sfe_unregister_flow_cookie_cb + * unregister function which is used to configure flow cookie for a flow + * + * return: 0, success; !=0, fail + */ +int sfe_unregister_flow_cookie_cb(flow_cookie_set_func_t cb); + +typedef int (*sfe_ipv6_flow_cookie_set_func_t)(u32 protocol, __be32 src_ip[4], __be16 src_port, + __be32 dst_ip[4], __be16 dst_port, u16 flow_cookie); + +/* + * sfe_ipv6_register_flow_cookie_cb + * register a function in SFE to let SFE use this function to configure flow cookie for a flow + * + * Hardware driver which support flow cookie should register a callback function in SFE. Then SFE + * can use this function to configure flow cookie for a flow. + * return: 0, success; !=0, fail + */ +int sfe_ipv6_register_flow_cookie_cb(sfe_ipv6_flow_cookie_set_func_t cb); + +/* + * sfe_ipv6_unregister_flow_cookie_cb + * unregister function which is used to configure flow cookie for a flow + * + * return: 0, success; !=0, fail + */ +int sfe_ipv6_unregister_flow_cookie_cb(sfe_ipv6_flow_cookie_set_func_t cb); + +#endif /*CONFIG_NF_FLOW_COOKIE*/ diff --git a/shortcut-fe/sfe_backport.h b/shortcut-fe/sfe_backport.h new file mode 100644 index 000000000..d2d60c73c --- /dev/null +++ b/shortcut-fe/sfe_backport.h @@ -0,0 +1,195 @@ +/* + * sfe_backport.h + * Shortcut forwarding engine compatible header file. + * + * Copyright (c) 2014-2016 The Linux Foundation. All rights reserved. + * Permission to use, copy, modify, and/or distribute this software for + * any purpose with or without fee is hereby granted, provided that the + * above copyright notice and this permission notice appear in all copies. + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT + * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#include + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3, 4, 0)) +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3, 7, 0)) +#include +#else +enum udp_conntrack { + UDP_CT_UNREPLIED, + UDP_CT_REPLIED, + UDP_CT_MAX +}; + +static inline unsigned int * +nf_ct_timeout_lookup(struct net *net, struct nf_conn *ct, + struct nf_conntrack_l4proto *l4proto) +{ +#ifdef CONFIG_NF_CONNTRACK_TIMEOUT + struct nf_conn_timeout *timeout_ext; + unsigned int *timeouts; + + timeout_ext = nf_ct_timeout_find(ct); + if (timeout_ext) + timeouts = NF_CT_TIMEOUT_EXT_DATA(timeout_ext); + else + timeouts = l4proto->get_timeouts(net); + + return timeouts; +#else + return l4proto->get_timeouts(net); +#endif /*CONFIG_NF_CONNTRACK_TIMEOUT*/ +} +#endif /*KERNEL_VERSION(3, 7, 0)*/ +#endif /*KERNEL_VERSION(3, 4, 0)*/ + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 4, 0)) +#define sfe_define_post_routing_hook(FN_NAME, HOOKNUM, OPS, SKB, UNUSED, OUT, OKFN) \ +static unsigned int FN_NAME(void *priv, \ + struct sk_buff *SKB, \ + const struct nf_hook_state *state) +#elif (LINUX_VERSION_CODE >= KERNEL_VERSION(3, 13, 0)) +#define sfe_define_post_routing_hook(FN_NAME, HOOKNUM, OPS, SKB, UNUSED, OUT, OKFN) \ +static unsigned int FN_NAME(const struct nf_hook_ops *OPS, \ + struct sk_buff *SKB, \ + const struct net_device *UNUSED, \ + const struct net_device *OUT, \ + int (*OKFN)(struct sk_buff *)) +#else +#define sfe_define_post_routing_hook(FN_NAME, HOOKNUM, OPS, SKB, UNUSED, OUT, OKFN) \ +static unsigned int FN_NAME(unsigned int HOOKNUM, \ + struct sk_buff *SKB, \ + const struct net_device *UNUSED, \ + const struct net_device *OUT, \ + int (*OKFN)(struct sk_buff *)) +#endif + +#define sfe_cm_ipv4_post_routing_hook(HOOKNUM, OPS, SKB, UNUSED, OUT, OKFN) \ + sfe_define_post_routing_hook(__sfe_cm_ipv4_post_routing_hook, HOOKNUM, OPS, SKB, UNUSED, OUT, OKFN) +#define sfe_cm_ipv6_post_routing_hook(HOOKNUM, OPS, SKB, UNUSED, OUT, OKFN) \ + sfe_define_post_routing_hook(__sfe_cm_ipv6_post_routing_hook, HOOKNUM, OPS, SKB, UNUSED, OUT, OKFN) +#define fast_classifier_ipv4_post_routing_hook(HOOKNUM, OPS, SKB, UNUSED, OUT, OKFN) \ + sfe_define_post_routing_hook(__fast_classifier_ipv4_post_routing_hook, HOOKNUM, OPS, SKB, UNUSED, OUT, OKFN) +#define fast_classifier_ipv6_post_routing_hook(HOOKNUM, OPS, SKB, UNUSED, OUT, OKFN) \ + sfe_define_post_routing_hook(__fast_classifier_ipv6_post_routing_hook, HOOKNUM, OPS, SKB, UNUSED, OUT, OKFN) + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 4, 0)) +#define SFE_IPV4_NF_POST_ROUTING_HOOK(fn) \ + { \ + .hook = fn, \ + .pf = NFPROTO_IPV4, \ + .hooknum = NF_INET_POST_ROUTING, \ + .priority = NF_IP_PRI_NAT_SRC + 1, \ + } +#else +#define SFE_IPV4_NF_POST_ROUTING_HOOK(fn) \ + { \ + .hook = fn, \ + .owner = THIS_MODULE, \ + .pf = NFPROTO_IPV4, \ + .hooknum = NF_INET_POST_ROUTING, \ + .priority = NF_IP_PRI_NAT_SRC + 1, \ + } +#endif + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 4, 0)) +#define SFE_IPV6_NF_POST_ROUTING_HOOK(fn) \ + { \ + .hook = fn, \ + .pf = NFPROTO_IPV6, \ + .hooknum = NF_INET_POST_ROUTING, \ + .priority = NF_IP_PRI_NAT_SRC + 1, \ + } +#else +#define SFE_IPV6_NF_POST_ROUTING_HOOK(fn) \ + { \ + .hook = fn, \ + .owner = THIS_MODULE, \ + .pf = NFPROTO_IPV6, \ + .hooknum = NF_INET_POST_ROUTING, \ + .priority = NF_IP6_PRI_NAT_SRC + 1, \ + } +#endif + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0)) +#define SFE_NF_CT_DEFAULT_ZONE (&nf_ct_zone_dflt) +#else +#define SFE_NF_CT_DEFAULT_ZONE NF_CT_DEFAULT_ZONE +#endif + +/* + * sfe_dev_get_master + * get master of bridge port, and hold it + */ +static 
inline struct net_device *sfe_dev_get_master(struct net_device *dev) +{ + struct net_device *master; +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3, 9, 0)) + rcu_read_lock(); + master = netdev_master_upper_dev_get_rcu(dev); + if (master) + dev_hold(master); + + rcu_read_unlock(); +#else + master = dev->master; + if (master) + dev_hold(master); +#endif + return master; +} + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3, 11, 0)) +#define SFE_DEV_EVENT_PTR(PTR) netdev_notifier_info_to_dev(PTR) +#else +#define SFE_DEV_EVENT_PTR(PTR) (struct net_device *)(PTR) +#endif + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3, 13, 0)) +#define SFE_NF_CONN_ACCT(NM) struct nf_conn_acct *NM +#else +#define SFE_NF_CONN_ACCT(NM) struct nf_conn_counter *NM +#endif + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3, 13, 0)) +#define SFE_ACCT_COUNTER(NM) ((NM)->counter) +#else +#define SFE_ACCT_COUNTER(NM) (NM) +#endif + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3, 9, 0)) +#define sfe_hash_for_each_possible(name, obj, node, member, key) \ + hash_for_each_possible(name, obj, member, key) +#else +#define sfe_hash_for_each_possible(name, obj, node, member, key) \ + hash_for_each_possible(name, obj, node, member, key) +#endif + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3, 9, 0)) +#define sfe_hash_for_each(name, bkt, node, obj, member) \ + hash_for_each(name, bkt, obj, member) +#else +#define sfe_hash_for_each(name, bkt, node, obj, member) \ + hash_for_each(name, bkt, node, obj, member) +#endif + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3, 4, 0)) +#define sfe_dst_get_neighbour(dst, daddr) dst_neigh_lookup(dst, addr) +#else +static inline struct neighbour * +sfe_dst_get_neighbour(struct dst_entry *dst, void *daddr) +{ + struct neighbour *neigh = dst_get_neighbour_noref(dst); + + if (neigh) + neigh_hold(neigh); + + return neigh; +} +#endif diff --git a/shortcut-fe/sfe_cm.c b/shortcut-fe/sfe_cm.c new file mode 100644 index 000000000..bd1bb88aa --- /dev/null +++ b/shortcut-fe/sfe_cm.c @@ -0,0 +1,1154 @@ +/* + * sfe-cm.c + * Shortcut forwarding engine connection manager. + * + * Copyright (c) 2013-2018, 2020 The Linux Foundation. All rights reserved. + * Permission to use, copy, modify, and/or distribute this software for + * any purpose with or without fee is hereby granted, provided that the + * above copyright notice and this permission notice appear in all copies. + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT + * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sfe.h" +#include "sfe_cm.h" +#include "sfe_backport.h" + +typedef enum sfe_cm_exception { + SFE_CM_EXCEPTION_PACKET_BROADCAST, + SFE_CM_EXCEPTION_PACKET_MULTICAST, + SFE_CM_EXCEPTION_NO_IIF, + SFE_CM_EXCEPTION_NO_CT, + SFE_CM_EXCEPTION_CT_NO_TRACK, + SFE_CM_EXCEPTION_CT_NO_CONFIRM, + SFE_CM_EXCEPTION_CT_IS_ALG, + SFE_CM_EXCEPTION_IS_IPV4_MCAST, + SFE_CM_EXCEPTION_IS_IPV6_MCAST, + SFE_CM_EXCEPTION_TCP_NOT_ASSURED, + SFE_CM_EXCEPTION_TCP_NOT_ESTABLISHED, + SFE_CM_EXCEPTION_UNKNOW_PROTOCOL, + SFE_CM_EXCEPTION_NO_SRC_DEV, + SFE_CM_EXCEPTION_NO_SRC_XLATE_DEV, + SFE_CM_EXCEPTION_NO_DEST_DEV, + SFE_CM_EXCEPTION_NO_DEST_XLATE_DEV, + SFE_CM_EXCEPTION_NO_BRIDGE, + SFE_CM_EXCEPTION_LOCAL_OUT, + SFE_CM_EXCEPTION_MAX +} sfe_cm_exception_t; + +static char *sfe_cm_exception_events_string[SFE_CM_EXCEPTION_MAX] = { + "PACKET_BROADCAST", + "PACKET_MULTICAST", + "NO_IIF", + "NO_CT", + "CT_NO_TRACK", + "CT_NO_CONFIRM", + "CT_IS_ALG", + "IS_IPV4_MCAST", + "IS_IPV6_MCAST", + "TCP_NOT_ASSURED", + "TCP_NOT_ESTABLISHED", + "UNKNOW_PROTOCOL", + "NO_SRC_DEV", + "NO_SRC_XLATE_DEV", + "NO_DEST_DEV", + "NO_DEST_XLATE_DEV", + "NO_BRIDGE", + "LOCAL_OUT" +}; + +/* + * Per-module structure. + */ +struct sfe_cm { + spinlock_t lock; /* Lock for SMP correctness */ + + /* + * Control state. + */ + struct kobject *sys_sfe_cm; /* sysfs linkage */ + + /* + * Callback notifiers. + */ + struct notifier_block dev_notifier; /* Device notifier */ + struct notifier_block inet_notifier; /* IPv4 notifier */ + struct notifier_block inet6_notifier; /* IPv6 notifier */ + u32 exceptions[SFE_CM_EXCEPTION_MAX]; +}; + +static struct sfe_cm __sc; + +/* + * sfe_cm_incr_exceptions() + * increase an exception counter. + */ +static inline void sfe_cm_incr_exceptions(sfe_cm_exception_t except) +{ + struct sfe_cm *sc = &__sc; + + spin_lock_bh(&sc->lock); + sc->exceptions[except]++; + spin_unlock_bh(&sc->lock); +} + +/* + * sfe_cm_recv() + * Handle packet receives. + * + * Returns 1 if the packet is forwarded or 0 if it isn't. + */ +int sfe_cm_recv(struct sk_buff *skb) +{ + struct net_device *dev; + + /* + * We know that for the vast majority of packets we need the transport + * layer header so we may as well start to fetch it now! + */ + prefetch(skb->data + 32); + barrier(); + + dev = skb->dev; + + /* + * We're only interested in IPv4 and IPv6 packets. + */ + if (likely(htons(ETH_P_IP) == skb->protocol)) { + struct in_device *in_dev; + + /* + * Does our input device support IP processing? + */ + in_dev = (struct in_device *)dev->ip_ptr; + if (unlikely(!in_dev)) { + DEBUG_TRACE("no IP processing for device: %s\n", dev->name); + return 0; + } + + /* + * Does it have an IP address? If it doesn't then we can't do anything + * interesting here! + */ + if (unlikely(!in_dev->ifa_list)) { + DEBUG_TRACE("no IP address for device: %s\n", dev->name); + return 0; + } + + return sfe_ipv4_recv(dev, skb); + } + + if (likely(htons(ETH_P_IPV6) == skb->protocol)) { + struct inet6_dev *in_dev; + + /* + * Does our input device support IPv6 processing? + */ + in_dev = (struct inet6_dev *)dev->ip6_ptr; + if (unlikely(!in_dev)) { + DEBUG_TRACE("no IPv6 processing for device: %s\n", dev->name); + return 0; + } + + /* + * Does it have an IPv6 address? If it doesn't then we can't do anything + * interesting here! 
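+ * Returning 0 from this check (and the ones above) hands the packet
+ * back to the normal Linux receive path; only packets SFE forwards
+ * itself make sfe_cm_recv() return 1.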
+ */ + if (unlikely(list_empty(&in_dev->addr_list))) { + DEBUG_TRACE("no IPv6 address for device: %s\n", dev->name); + return 0; + } + + return sfe_ipv6_recv(dev, skb); + } + + DEBUG_TRACE("not IP packet\n"); + return 0; +} + +/* + * sfe_cm_find_dev_and_mac_addr() + * Find the device and MAC address for a given IPv4/IPv6 address. + * + * Returns true if we find the device and MAC address, otherwise false. + * + * We look up the rtable entry for the address and, from its neighbour + * structure, obtain the hardware address. This means this function also + * works if the neighbours are routers too. + */ +static bool sfe_cm_find_dev_and_mac_addr(sfe_ip_addr_t *addr, struct net_device **dev, u8 *mac_addr, int is_v4) +{ + struct neighbour *neigh; + struct rtable *rt; + struct rt6_info *rt6; + struct dst_entry *dst; + struct net_device *mac_dev; + + /* + * Look up the rtable entry for the IP address then get the hardware + * address from its neighbour structure. This means this work when the + * neighbours are routers too. + */ + if (likely(is_v4)) { + rt = ip_route_output(&init_net, addr->ip, 0, 0, 0); + if (unlikely(IS_ERR(rt))) { + goto ret_fail; + } + + dst = (struct dst_entry *)rt; + } else { +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 17, 0)) + rt6 = rt6_lookup(&init_net, (struct in6_addr *)addr->ip6, 0, 0, 0); +#else + rt6 = rt6_lookup(&init_net, (struct in6_addr *)addr->ip6, 0, 0, NULL, 0); +#endif + if (!rt6) { + goto ret_fail; + } + + dst = (struct dst_entry *)rt6; + } + + rcu_read_lock(); + neigh = sfe_dst_get_neighbour(dst, addr); + if (unlikely(!neigh)) { + rcu_read_unlock(); + dst_release(dst); + goto ret_fail; + } + + if (unlikely(!(neigh->nud_state & NUD_VALID))) { + rcu_read_unlock(); + neigh_release(neigh); + dst_release(dst); + goto ret_fail; + } + + mac_dev = neigh->dev; + if (!mac_dev) { + rcu_read_unlock(); + neigh_release(neigh); + dst_release(dst); + goto ret_fail; + } + + memcpy(mac_addr, neigh->ha, (size_t)mac_dev->addr_len); + + dev_hold(mac_dev); + *dev = mac_dev; + rcu_read_unlock(); + neigh_release(neigh); + dst_release(dst); + + return true; + +ret_fail: + if (is_v4) { + DEBUG_TRACE("failed to find MAC address for IP: %pI4\n", &addr->ip); + + } else { + DEBUG_TRACE("failed to find MAC address for IP: %pI6\n", addr->ip6); + } + + return false; +} + +/* + * sfe_cm_post_routing() + * Called for packets about to leave the box - either locally generated or forwarded from another interface + */ +static unsigned int sfe_cm_post_routing(struct sk_buff *skb, int is_v4) +{ + struct sfe_connection_create sic; + struct net_device *in; + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + struct net_device *dev; + struct net_device *src_dev; + struct net_device *dest_dev; + struct net_device *src_dev_tmp; + struct net_device *dest_dev_tmp; + struct net_device *src_br_dev = NULL; + struct net_device *dest_br_dev = NULL; + struct nf_conntrack_tuple orig_tuple; + struct nf_conntrack_tuple reply_tuple; + SFE_NF_CONN_ACCT(acct); + + /* + * Don't process broadcast or multicast packets. 
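+ * Both cases simply return NF_ACCEPT so the normal Netfilter path keeps
+ * handling them; they are only counted as exceptions here.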
+ */ + if (unlikely(skb->pkt_type == PACKET_BROADCAST)) { + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_PACKET_BROADCAST); + DEBUG_TRACE("broadcast, ignoring\n"); + return NF_ACCEPT; + } + if (unlikely(skb->pkt_type == PACKET_MULTICAST)) { + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_PACKET_MULTICAST); + DEBUG_TRACE("multicast, ignoring\n"); + return NF_ACCEPT; + } + +#ifdef CONFIG_XFRM + /* + * Packet to xfrm for encapsulation, we can't process it + */ + if (unlikely(skb_dst(skb)->xfrm)) { + DEBUG_TRACE("packet to xfrm, ignoring\n"); + return NF_ACCEPT; + } +#endif + + /* + * Don't process locally generated packets. + */ + if (skb->sk) { + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_LOCAL_OUT); + DEBUG_TRACE("skip local out packet\n"); + return NF_ACCEPT; + } + + /* + * Don't process packets that are not being forwarded. + */ + in = dev_get_by_index(&init_net, skb->skb_iif); + if (!in) { + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_IIF); + DEBUG_TRACE("packet not forwarding\n"); + return NF_ACCEPT; + } + + dev_put(in); + + /* + * Don't process packets that aren't being tracked by conntrack. + */ + ct = nf_ct_get(skb, &ctinfo); + if (unlikely(!ct)) { + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_CT); + DEBUG_TRACE("no conntrack connection, ignoring\n"); + return NF_ACCEPT; + } + + /* + * Don't process untracked connections. + */ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0)) + if (unlikely(nf_ct_is_untracked(ct))) { +#else + if (unlikely(ctinfo == IP_CT_UNTRACKED)) { +#endif + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_CT_NO_TRACK); + DEBUG_TRACE("untracked connection\n"); + return NF_ACCEPT; + } + + /* + * Unconfirmed connection may be dropped by Linux at the final step, + * So we don't process unconfirmed connections. + */ + if (!nf_ct_is_confirmed(ct)) { + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_CT_NO_CONFIRM); + DEBUG_TRACE("unconfirmed connection\n"); + return NF_ACCEPT; + } + + /* + * Don't process connections that require support from a 'helper' (typically a NAT ALG). + */ + if (unlikely(nfct_help(ct))) { + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_CT_IS_ALG); + DEBUG_TRACE("connection has helper\n"); + return NF_ACCEPT; + } + + /* + * Check if the acceleration of a flow could be rejected quickly. + */ + acct = nf_conn_acct_find(ct); + if (acct) { + long long packets = atomic64_read(&SFE_ACCT_COUNTER(acct)[CTINFO2DIR(ctinfo)].packets); + if ((packets > 0xff) && (packets & 0xff)) { + /* + * Connection hits slow path at least 256 times, so it must be not able to accelerate. + * But we also give it a chance to walk through ECM every 256 packets + */ + return NF_ACCEPT; + } + } + + /* + * Look up the details of our connection in conntrack. + * + * Note that the data we get from conntrack is for the "ORIGINAL" direction + * but our packet may actually be in the "REPLY" direction. 
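+ *
+ * Illustrative example (all addresses and ports here are hypothetical):
+ * for a LAN host 192.168.1.10:5000 talking to 8.8.8.8:53 behind SNAT to
+ * 203.0.113.2, the ORIGINAL tuple is 192.168.1.10:5000 -> 8.8.8.8:53
+ * while the REPLY tuple is 8.8.8.8:53 -> 203.0.113.2:<nat port>. That is
+ * why the xlate addresses below are read from the reply tuple.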
+ */ + orig_tuple = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; + reply_tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple; + sic.protocol = (s32)orig_tuple.dst.protonum; + + sic.flags = 0; + + /* + * Get addressing information, non-NAT first + */ + if (likely(is_v4)) { + u32 dscp; + + sic.src_ip.ip = (__be32)orig_tuple.src.u3.ip; + sic.dest_ip.ip = (__be32)orig_tuple.dst.u3.ip; + + if (ipv4_is_multicast(sic.src_ip.ip) || ipv4_is_multicast(sic.dest_ip.ip)) { + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_IS_IPV4_MCAST); + DEBUG_TRACE("multicast address\n"); + return NF_ACCEPT; + } + + /* + * NAT'ed addresses - note these are as seen from the 'reply' direction + * When NAT does not apply to this connection these will be identical to the above. + */ + sic.src_ip_xlate.ip = (__be32)reply_tuple.dst.u3.ip; + sic.dest_ip_xlate.ip = (__be32)reply_tuple.src.u3.ip; + + dscp = ipv4_get_dsfield(ip_hdr(skb)) >> XT_DSCP_SHIFT; + if (dscp) { + sic.dest_dscp = dscp; + sic.src_dscp = sic.dest_dscp; + sic.flags |= SFE_CREATE_FLAG_REMARK_DSCP; + } + } else { + u32 dscp; + + sic.src_ip.ip6[0] = *((struct sfe_ipv6_addr *)&orig_tuple.src.u3.in6); + sic.dest_ip.ip6[0] = *((struct sfe_ipv6_addr *)&orig_tuple.dst.u3.in6); + + if (ipv6_addr_is_multicast((struct in6_addr *)sic.src_ip.ip6) || + ipv6_addr_is_multicast((struct in6_addr *)sic.dest_ip.ip6)) { + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_IS_IPV6_MCAST); + DEBUG_TRACE("multicast address\n"); + return NF_ACCEPT; + } + + /* + * NAT'ed addresses - note these are as seen from the 'reply' direction + * When NAT does not apply to this connection these will be identical to the above. + */ + sic.src_ip_xlate.ip6[0] = *((struct sfe_ipv6_addr *)&reply_tuple.dst.u3.in6); + sic.dest_ip_xlate.ip6[0] = *((struct sfe_ipv6_addr *)&reply_tuple.src.u3.in6); + + dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> XT_DSCP_SHIFT; + if (dscp) { + sic.dest_dscp = dscp; + sic.src_dscp = sic.dest_dscp; + sic.flags |= SFE_CREATE_FLAG_REMARK_DSCP; + } + } + + switch (sic.protocol) { + case IPPROTO_TCP: + sic.src_port = orig_tuple.src.u.tcp.port; + sic.dest_port = orig_tuple.dst.u.tcp.port; + sic.src_port_xlate = reply_tuple.dst.u.tcp.port; + sic.dest_port_xlate = reply_tuple.src.u.tcp.port; + sic.src_td_window_scale = ct->proto.tcp.seen[0].td_scale; + sic.src_td_max_window = ct->proto.tcp.seen[0].td_maxwin; + sic.src_td_end = ct->proto.tcp.seen[0].td_end; + sic.src_td_max_end = ct->proto.tcp.seen[0].td_maxend; + sic.dest_td_window_scale = ct->proto.tcp.seen[1].td_scale; + sic.dest_td_max_window = ct->proto.tcp.seen[1].td_maxwin; + sic.dest_td_end = ct->proto.tcp.seen[1].td_end; + sic.dest_td_max_end = ct->proto.tcp.seen[1].td_maxend; + + if (nf_ct_tcp_no_window_check + || (ct->proto.tcp.seen[0].flags & IP_CT_TCP_FLAG_BE_LIBERAL) + || (ct->proto.tcp.seen[1].flags & IP_CT_TCP_FLAG_BE_LIBERAL)) { + sic.flags |= SFE_CREATE_FLAG_NO_SEQ_CHECK; + } + + /* + * Don't try to manage a non-established connection. + */ + if (!test_bit(IPS_ASSURED_BIT, &ct->status)) { + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_TCP_NOT_ASSURED); + DEBUG_TRACE("non-established connection\n"); + return NF_ACCEPT; + } + + /* + * If the connection is shutting down do not manage it. + * state can not be SYN_SENT, SYN_RECV because connection is assured + * Not managed states: FIN_WAIT, CLOSE_WAIT, LAST_ACK, TIME_WAIT, CLOSE. 
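+ * In other words, only connections in TCP_CONNTRACK_ESTABLISHED are
+ * offloaded; every other state is left to the normal Linux path.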
+ */ + spin_lock_bh(&ct->lock); + if (ct->proto.tcp.state != TCP_CONNTRACK_ESTABLISHED) { + spin_unlock_bh(&ct->lock); + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_TCP_NOT_ESTABLISHED); + DEBUG_TRACE("connection in termination state: %#x, s: %pI4:%u, d: %pI4:%u\n", + ct->proto.tcp.state, &sic.src_ip, ntohs(sic.src_port), + &sic.dest_ip, ntohs(sic.dest_port)); + return NF_ACCEPT; + } + spin_unlock_bh(&ct->lock); + break; + + case IPPROTO_UDP: + sic.src_port = orig_tuple.src.u.udp.port; + sic.dest_port = orig_tuple.dst.u.udp.port; + sic.src_port_xlate = reply_tuple.dst.u.udp.port; + sic.dest_port_xlate = reply_tuple.src.u.udp.port; + break; + + default: + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_UNKNOW_PROTOCOL); + DEBUG_TRACE("unhandled protocol %d\n", sic.protocol); + return NF_ACCEPT; + } + +#ifdef CONFIG_XFRM + sic.original_accel = 1; + sic.reply_accel = 1; + + /* + * For packets de-capsulated from xfrm, we still can accelerate it + * on the direction we just received the packet. + */ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(5, 0, 0)) + if (unlikely(skb->sp)) { +#else + if (unlikely(secpath_exists(skb))) { +#endif + if (sic.protocol == IPPROTO_TCP && + !(sic.flags & SFE_CREATE_FLAG_NO_SEQ_CHECK)) { + return NF_ACCEPT; + } + + if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) { + sic.reply_accel = 0; + } else { + sic.original_accel = 0; + } + } +#endif + + /* + * Get QoS information + */ + if (skb->priority) { + sic.dest_priority = skb->priority; + sic.src_priority = sic.dest_priority; + sic.flags |= SFE_CREATE_FLAG_REMARK_PRIORITY; + } + + /* + * Get the net device and MAC addresses that correspond to the various source and + * destination host addresses. + */ + if (!sfe_cm_find_dev_and_mac_addr(&sic.src_ip, &src_dev_tmp, sic.src_mac, is_v4)) { + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_SRC_DEV); + return NF_ACCEPT; + } + src_dev = src_dev_tmp; + + if (!sfe_cm_find_dev_and_mac_addr(&sic.src_ip_xlate, &dev, sic.src_mac_xlate, is_v4)) { + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_SRC_XLATE_DEV); + goto done1; + } + dev_put(dev); + + if (!sfe_cm_find_dev_and_mac_addr(&sic.dest_ip, &dev, sic.dest_mac, is_v4)) { + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_DEST_DEV); + goto done1; + } + dev_put(dev); + + if (!sfe_cm_find_dev_and_mac_addr(&sic.dest_ip_xlate, &dest_dev_tmp, sic.dest_mac_xlate, is_v4)) { + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_DEST_XLATE_DEV); + goto done1; + } + dest_dev = dest_dev_tmp; + + /* + * Our devices may actually be part of a bridge interface. If that's + * the case then find the bridge interface instead. + */ + if (src_dev->priv_flags & IFF_BRIDGE_PORT) { + src_br_dev = sfe_dev_get_master(src_dev); + if (!src_br_dev) { + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_BRIDGE); + DEBUG_TRACE("no bridge found for: %s\n", src_dev->name); + goto done2; + } + src_dev = src_br_dev; + } + + if (dest_dev->priv_flags & IFF_BRIDGE_PORT) { + dest_br_dev = sfe_dev_get_master(dest_dev); + if (!dest_br_dev) { + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_BRIDGE); + DEBUG_TRACE("no bridge found for: %s\n", dest_dev->name); + goto done3; + } + dest_dev = dest_br_dev; + } + + sic.src_dev = src_dev; + sic.dest_dev = dest_dev; + + sic.src_mtu = src_dev->mtu; + sic.dest_mtu = dest_dev->mtu; + + if (likely(is_v4)) { + sfe_ipv4_create_rule(&sic); + } else { + sfe_ipv6_create_rule(&sic); + } + + /* + * If we had bridge ports then release them too. 
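+ * (Each dev_put() below balances either the dev_hold() taken in
+ * sfe_cm_find_dev_and_mac_addr() or the one taken by sfe_dev_get_master().)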
+ */ + if (dest_br_dev) { + dev_put(dest_br_dev); + } +done3: + if (src_br_dev) { + dev_put(src_br_dev); + } +done2: + dev_put(dest_dev_tmp); +done1: + dev_put(src_dev_tmp); + + return NF_ACCEPT; +} + +/* + * sfe_cm_ipv4_post_routing_hook() + * Called for packets about to leave the box - either locally generated or forwarded from another interface + */ +sfe_cm_ipv4_post_routing_hook(hooknum, ops, skb, in_unused, out, okfn) +{ + return sfe_cm_post_routing(skb, true); +} + +/* + * sfe_cm_ipv6_post_routing_hook() + * Called for packets about to leave the box - either locally generated or forwarded from another interface + */ +sfe_cm_ipv6_post_routing_hook(hooknum, ops, skb, in_unused, out, okfn) +{ + return sfe_cm_post_routing(skb, false); +} + +#ifdef CONFIG_NF_CONNTRACK_EVENTS +/* + * sfe_cm_conntrack_event() + * Callback event invoked when a conntrack connection's state changes. + */ +#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS +static int sfe_cm_conntrack_event(struct notifier_block *this, + unsigned long events, void *ptr) +#else +static int sfe_cm_conntrack_event(unsigned int events, struct nf_ct_event *item) +#endif +{ +#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS + struct nf_ct_event *item = ptr; +#endif + struct sfe_connection_destroy sid; + struct nf_conn *ct = item->ct; + struct nf_conntrack_tuple orig_tuple; + + /* + * If we don't have a conntrack entry then we're done. + */ + if (unlikely(!ct)) { + DEBUG_WARN("no ct in conntrack event callback\n"); + return NOTIFY_DONE; + } + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0)) + if (unlikely(nf_ct_is_untracked(ct))) { + DEBUG_TRACE("ignoring untracked conn\n"); + return NOTIFY_DONE; + } +#endif + + /* + * We're only interested in destroy events. + */ + if (unlikely(!(events & (1 << IPCT_DESTROY)))) { + DEBUG_TRACE("ignoring non-destroy event\n"); + return NOTIFY_DONE; + } + + orig_tuple = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; + sid.protocol = (s32)orig_tuple.dst.protonum; + + /* + * Extract information from the conntrack connection. We're only interested + * in nominal connection information (i.e. we're ignoring any NAT information). + */ + switch (sid.protocol) { + case IPPROTO_TCP: + sid.src_port = orig_tuple.src.u.tcp.port; + sid.dest_port = orig_tuple.dst.u.tcp.port; + break; + + case IPPROTO_UDP: + sid.src_port = orig_tuple.src.u.udp.port; + sid.dest_port = orig_tuple.dst.u.udp.port; + break; + + default: + DEBUG_TRACE("unhandled protocol: %d\n", sid.protocol); + return NOTIFY_DONE; + } + + if (likely(nf_ct_l3num(ct) == AF_INET)) { + sid.src_ip.ip = (__be32)orig_tuple.src.u3.ip; + sid.dest_ip.ip = (__be32)orig_tuple.dst.u3.ip; + + sfe_ipv4_destroy_rule(&sid); + } else if (likely(nf_ct_l3num(ct) == AF_INET6)) { + sid.src_ip.ip6[0] = *((struct sfe_ipv6_addr *)&orig_tuple.src.u3.in6); + sid.dest_ip.ip6[0] = *((struct sfe_ipv6_addr *)&orig_tuple.dst.u3.in6); + + sfe_ipv6_destroy_rule(&sid); + } else { + DEBUG_TRACE("ignoring non-IPv4 and non-IPv6 connection\n"); + } + + return NOTIFY_DONE; +} + +/* + * Netfilter conntrack event system to monitor connection tracking changes + */ +#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS +static struct notifier_block sfe_cm_conntrack_notifier = { + .notifier_call = sfe_cm_conntrack_event, +}; +#else +static struct nf_ct_event_notifier sfe_cm_conntrack_notifier = { + .fcn = sfe_cm_conntrack_event, +}; +#endif +#endif + +/* + * Structure to establish a hook into the post routing netfilter point - this + * will pick up local outbound and packets going from one interface to another. 
+ * + * Note: see include/linux/netfilter_ipv4.h for info related to priority levels. + * We want to examine packets after NAT translation and any ALG processing. + */ +static struct nf_hook_ops sfe_cm_ops_post_routing[] __read_mostly = { + SFE_IPV4_NF_POST_ROUTING_HOOK(__sfe_cm_ipv4_post_routing_hook), +#ifdef SFE_SUPPORT_IPV6 + SFE_IPV6_NF_POST_ROUTING_HOOK(__sfe_cm_ipv6_post_routing_hook), +#endif +}; + +/* + * sfe_cm_sync_rule() + * Synchronize a connection's state. + */ +static void sfe_cm_sync_rule(struct sfe_connection_sync *sis) +{ + struct nf_conntrack_tuple_hash *h; + struct nf_conntrack_tuple tuple; + struct nf_conn *ct; + SFE_NF_CONN_ACCT(acct); + + /* + * Create a tuple so as to be able to look up a connection + */ + memset(&tuple, 0, sizeof(tuple)); + tuple.src.u.all = (__be16)sis->src_port; + tuple.dst.dir = IP_CT_DIR_ORIGINAL; + tuple.dst.protonum = (u8)sis->protocol; + tuple.dst.u.all = (__be16)sis->dest_port; + + if (sis->is_v6) { + tuple.src.u3.in6 = *((struct in6_addr *)sis->src_ip.ip6); + tuple.dst.u3.in6 = *((struct in6_addr *)sis->dest_ip.ip6); + tuple.src.l3num = AF_INET6; + + DEBUG_TRACE("update connection - p: %d, s: %pI6:%u, d: %pI6:%u\n", + (int)tuple.dst.protonum, + &tuple.src.u3.in6, (unsigned int)ntohs(tuple.src.u.all), + &tuple.dst.u3.in6, (unsigned int)ntohs(tuple.dst.u.all)); + } else { + tuple.src.u3.ip = sis->src_ip.ip; + tuple.dst.u3.ip = sis->dest_ip.ip; + tuple.src.l3num = AF_INET; + + DEBUG_TRACE("update connection - p: %d, s: %pI4:%u, d: %pI4:%u\n", + (int)tuple.dst.protonum, + &tuple.src.u3.ip, (unsigned int)ntohs(tuple.src.u.all), + &tuple.dst.u3.ip, (unsigned int)ntohs(tuple.dst.u.all)); + } + + /* + * Look up conntrack connection + */ + h = nf_conntrack_find_get(&init_net, SFE_NF_CT_DEFAULT_ZONE, &tuple); + if (unlikely(!h)) { + DEBUG_TRACE("no connection found\n"); + return; + } + + ct = nf_ct_tuplehash_to_ctrack(h); +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 9, 0)) + NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct); +#endif + /* + * Only update if this is not a fixed timeout + */ + if (!test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) { + spin_lock_bh(&ct->lock); +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 9, 0)) + ct->timeout.expires += sis->delta_jiffies; +#else + ct->timeout += sis->delta_jiffies; +#endif + spin_unlock_bh(&ct->lock); + } + + acct = nf_conn_acct_find(ct); + if (acct) { + spin_lock_bh(&ct->lock); + atomic64_add(sis->src_new_packet_count, &SFE_ACCT_COUNTER(acct)[IP_CT_DIR_ORIGINAL].packets); + atomic64_add(sis->src_new_byte_count, &SFE_ACCT_COUNTER(acct)[IP_CT_DIR_ORIGINAL].bytes); + atomic64_add(sis->dest_new_packet_count, &SFE_ACCT_COUNTER(acct)[IP_CT_DIR_REPLY].packets); + atomic64_add(sis->dest_new_byte_count, &SFE_ACCT_COUNTER(acct)[IP_CT_DIR_REPLY].bytes); + spin_unlock_bh(&ct->lock); + } + + switch (sis->protocol) { + case IPPROTO_TCP: + spin_lock_bh(&ct->lock); + if (ct->proto.tcp.seen[0].td_maxwin < sis->src_td_max_window) { + ct->proto.tcp.seen[0].td_maxwin = sis->src_td_max_window; + } + if ((s32)(ct->proto.tcp.seen[0].td_end - sis->src_td_end) < 0) { + ct->proto.tcp.seen[0].td_end = sis->src_td_end; + } + if ((s32)(ct->proto.tcp.seen[0].td_maxend - sis->src_td_max_end) < 0) { + ct->proto.tcp.seen[0].td_maxend = sis->src_td_max_end; + } + if (ct->proto.tcp.seen[1].td_maxwin < sis->dest_td_max_window) { + ct->proto.tcp.seen[1].td_maxwin = sis->dest_td_max_window; + } + if ((s32)(ct->proto.tcp.seen[1].td_end - sis->dest_td_end) < 0) { + ct->proto.tcp.seen[1].td_end = sis->dest_td_end; + } + if 
((s32)(ct->proto.tcp.seen[1].td_maxend - sis->dest_td_max_end) < 0) { + ct->proto.tcp.seen[1].td_maxend = sis->dest_td_max_end; + } + spin_unlock_bh(&ct->lock); + break; +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3, 4, 0)) + case IPPROTO_UDP: + /* + * In Linux connection track, UDP flow has two timeout values: + * /proc/sys/net/netfilter/nf_conntrack_udp_timeout: + * this is for uni-direction UDP flow, normally its value is 60 seconds + * /proc/sys/net/netfilter/nf_conntrack_udp_timeout_stream: + * this is for bi-direction UDP flow, normally its value is 180 seconds + * + * Linux will update timer of UDP flow to stream timeout once it seen packets + * in reply direction. But if flow is accelerated by NSS or SFE, Linux won't + * see any packets. So we have to do the same thing in our stats sync message. + */ + if (!test_bit(IPS_ASSURED_BIT, &ct->status) && acct) { + u_int64_t reply_pkts = atomic64_read(&SFE_ACCT_COUNTER(acct)[IP_CT_DIR_REPLY].packets); + + if (reply_pkts != 0) { + unsigned int *timeouts; + struct nf_conntrack_l4proto *l4proto __maybe_unused; + set_bit(IPS_SEEN_REPLY_BIT, &ct->status); + set_bit(IPS_ASSURED_BIT, &ct->status); + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 19, 0)) + l4proto = __nf_ct_l4proto_find((sis->is_v6 ? AF_INET6 : AF_INET), IPPROTO_UDP); + timeouts = nf_ct_timeout_lookup(&init_net, ct, l4proto); + spin_lock_bh(&ct->lock); + ct->timeout.expires = jiffies + timeouts[UDP_CT_REPLIED]; + spin_unlock_bh(&ct->lock); +#else + timeouts = nf_ct_timeout_lookup(ct); + if (!timeouts) { + timeouts = udp_get_timeouts(nf_ct_net(ct)); + } + + spin_lock_bh(&ct->lock); + ct->timeout = jiffies + timeouts[UDP_CT_REPLIED]; + spin_unlock_bh(&ct->lock); +#endif + } + } + break; +#endif /*KERNEL_VERSION(3, 4, 0)*/ + } + + /* + * Release connection + */ + nf_ct_put(ct); +} + +/* + * sfe_cm_device_event() + */ +int sfe_cm_device_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct net_device *dev = SFE_DEV_EVENT_PTR(ptr); + + if (dev && (event == NETDEV_DOWN)) { + sfe_ipv4_destroy_all_rules_for_dev(dev); + sfe_ipv6_destroy_all_rules_for_dev(dev); + } + + return NOTIFY_DONE; +} + +/* + * sfe_cm_inet_event() + */ +static int sfe_cm_inet_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct net_device *dev = ((struct in_ifaddr *)ptr)->ifa_dev->dev; + + if (dev && (event == NETDEV_DOWN)) { + sfe_ipv4_destroy_all_rules_for_dev(dev); + } + + return NOTIFY_DONE; +} + +/* + * sfe_cm_inet6_event() + */ +static int sfe_cm_inet6_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct net_device *dev = ((struct inet6_ifaddr *)ptr)->idev->dev; + + if (dev && (event == NETDEV_DOWN)) { + sfe_ipv6_destroy_all_rules_for_dev(dev); + } + + return NOTIFY_DONE; +} + +/* + * sfe_cm_get_exceptions + * dump exception counters + */ +static ssize_t sfe_cm_get_exceptions(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + int idx, len; + struct sfe_cm *sc = &__sc; + + spin_lock_bh(&sc->lock); + for (len = 0, idx = 0; idx < SFE_CM_EXCEPTION_MAX; idx++) { + if (sc->exceptions[idx]) { + len += snprintf(buf + len, (ssize_t)(PAGE_SIZE - len), "%s = %d\n", sfe_cm_exception_events_string[idx], sc->exceptions[idx]); + } + } + spin_unlock_bh(&sc->lock); + + return len; +} + +/* + * sysfs attributes. 
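+ *
+ * The "exceptions" attribute registered below is read-only and can be
+ * inspected from user space, e.g.:
+ *
+ *   cat /sys/sfe_cm/exceptions
+ *
+ * which prints one "<EVENT> = <count>" line for each non-zero counter.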
+ */ +static const struct device_attribute sfe_cm_exceptions_attr = + __ATTR(exceptions, S_IRUGO, sfe_cm_get_exceptions, NULL); + +/* + * sfe_cm_init() + */ +static int __init sfe_cm_init(void) +{ + struct sfe_cm *sc = &__sc; + int result = -1; + + DEBUG_INFO("SFE CM init\n"); + + /* + * Create sys/sfe_cm + */ + sc->sys_sfe_cm = kobject_create_and_add("sfe_cm", NULL); + if (!sc->sys_sfe_cm) { + DEBUG_ERROR("failed to register sfe_cm\n"); + goto exit1; + } + + /* + * Create sys/sfe_cm/exceptions + */ + result = sysfs_create_file(sc->sys_sfe_cm, &sfe_cm_exceptions_attr.attr); + if (result) { + DEBUG_ERROR("failed to register exceptions file: %d\n", result); + goto exit2; + } + + sc->dev_notifier.notifier_call = sfe_cm_device_event; + sc->dev_notifier.priority = 1; + register_netdevice_notifier(&sc->dev_notifier); + + sc->inet_notifier.notifier_call = sfe_cm_inet_event; + sc->inet_notifier.priority = 1; + register_inetaddr_notifier(&sc->inet_notifier); + + sc->inet6_notifier.notifier_call = sfe_cm_inet6_event; + sc->inet6_notifier.priority = 1; + register_inet6addr_notifier(&sc->inet6_notifier); + /* + * Register our netfilter hooks. + */ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 13, 0)) + result = nf_register_hooks(sfe_cm_ops_post_routing, ARRAY_SIZE(sfe_cm_ops_post_routing)); +#else + result = nf_register_net_hooks(&init_net, sfe_cm_ops_post_routing, ARRAY_SIZE(sfe_cm_ops_post_routing)); +#endif + if (result < 0) { + DEBUG_ERROR("can't register nf post routing hook: %d\n", result); + goto exit3; + } + + /* + * Register a notifier hook to get fast notifications of expired connections. + * Note: In CONFIG_NF_CONNTRACK_CHAIN_EVENTS enabled case, nf_conntrack_register_notifier() + * function always returns 0. + */ +#ifdef CONFIG_NF_CONNTRACK_EVENTS +#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS + (void)nf_conntrack_register_notifier(&init_net, &sfe_cm_conntrack_notifier); +#else + result = nf_conntrack_register_notifier(&init_net, &sfe_cm_conntrack_notifier); + if (result < 0) { + DEBUG_ERROR("can't register nf notifier hook: %d\n", result); + goto exit4; + } +#endif +#endif + + spin_lock_init(&sc->lock); + + /* + * Hook the receive path in the network stack. + */ + BUG_ON(athrs_fast_nat_recv); + RCU_INIT_POINTER(athrs_fast_nat_recv, sfe_cm_recv); + + /* + * Hook the shortcut sync callback. + */ + sfe_ipv4_register_sync_rule_callback(sfe_cm_sync_rule); + sfe_ipv6_register_sync_rule_callback(sfe_cm_sync_rule); + return 0; + +#ifdef CONFIG_NF_CONNTRACK_EVENTS +#ifndef CONFIG_NF_CONNTRACK_CHAIN_EVENTS +exit4: +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 13, 0)) + nf_unregister_hooks(sfe_cm_ops_post_routing, ARRAY_SIZE(sfe_cm_ops_post_routing)); +#else + nf_unregister_net_hooks(&init_net, sfe_cm_ops_post_routing, ARRAY_SIZE(sfe_cm_ops_post_routing)); +#endif + +#endif +#endif +exit3: + unregister_inet6addr_notifier(&sc->inet6_notifier); + unregister_inetaddr_notifier(&sc->inet_notifier); + unregister_netdevice_notifier(&sc->dev_notifier); +exit2: + kobject_put(sc->sys_sfe_cm); + +exit1: + return result; +} + +/* + * sfe_cm_exit() + */ +static void __exit sfe_cm_exit(void) +{ + struct sfe_cm *sc = &__sc; + + DEBUG_INFO("SFE CM exit\n"); + + /* + * Unregister our sync callback. + */ + sfe_ipv4_register_sync_rule_callback(NULL); + sfe_ipv6_register_sync_rule_callback(NULL); + + /* + * Unregister our receive callback. + */ + RCU_INIT_POINTER(athrs_fast_nat_recv, NULL); + + /* + * Wait for all callbacks to complete. + */ + rcu_barrier(); + + /* + * Destroy all connections. 
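+ * Passing a NULL net_device below asks the engine to flush every rule
+ * rather than just the rules tied to one interface.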
+ */ + sfe_ipv4_destroy_all_rules_for_dev(NULL); + sfe_ipv6_destroy_all_rules_for_dev(NULL); + +#ifdef CONFIG_NF_CONNTRACK_EVENTS + nf_conntrack_unregister_notifier(&init_net, &sfe_cm_conntrack_notifier); + +#endif + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 13, 0)) + nf_unregister_hooks(sfe_cm_ops_post_routing, ARRAY_SIZE(sfe_cm_ops_post_routing)); +#else + nf_unregister_net_hooks(&init_net, sfe_cm_ops_post_routing, ARRAY_SIZE(sfe_cm_ops_post_routing)); +#endif + unregister_inet6addr_notifier(&sc->inet6_notifier); + unregister_inetaddr_notifier(&sc->inet_notifier); + unregister_netdevice_notifier(&sc->dev_notifier); + + kobject_put(sc->sys_sfe_cm); +} + +module_init(sfe_cm_init) +module_exit(sfe_cm_exit) + +MODULE_DESCRIPTION("Shortcut Forwarding Engine - Connection Manager"); +MODULE_LICENSE("Dual BSD/GPL"); + diff --git a/shortcut-fe/sfe_cm.h b/shortcut-fe/sfe_cm.h new file mode 100644 index 000000000..23cbde859 --- /dev/null +++ b/shortcut-fe/sfe_cm.h @@ -0,0 +1,259 @@ +/* + * sfe_cm.h + * Shortcut forwarding engine. + * + * Copyright (c) 2013-2016 The Linux Foundation. All rights reserved. + * Permission to use, copy, modify, and/or distribute this software for + * any purpose with or without fee is hereby granted, provided that the + * above copyright notice and this permission notice appear in all copies. + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT + * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +/* + * connection flags. + */ +#define SFE_CREATE_FLAG_NO_SEQ_CHECK BIT(0) + /* Indicates that we should not check sequence numbers */ +#define SFE_CREATE_FLAG_REMARK_PRIORITY BIT(1) + /* Indicates that we should remark priority of skb */ +#define SFE_CREATE_FLAG_REMARK_DSCP BIT(2) + /* Indicates that we should remark DSCP of packet */ + +/* + * IPv6 address structure + */ +struct sfe_ipv6_addr { + __be32 addr[4]; +}; + +typedef union { + __be32 ip; + struct sfe_ipv6_addr ip6[1]; +} sfe_ip_addr_t; + +/* + * connection creation structure. + */ +struct sfe_connection_create { + int protocol; + struct net_device *src_dev; + struct net_device *dest_dev; + u32 flags; + u32 src_mtu; + u32 dest_mtu; + sfe_ip_addr_t src_ip; + sfe_ip_addr_t src_ip_xlate; + sfe_ip_addr_t dest_ip; + sfe_ip_addr_t dest_ip_xlate; + __be16 src_port; + __be16 src_port_xlate; + __be16 dest_port; + __be16 dest_port_xlate; + u8 src_mac[ETH_ALEN]; + u8 src_mac_xlate[ETH_ALEN]; + u8 dest_mac[ETH_ALEN]; + u8 dest_mac_xlate[ETH_ALEN]; + u8 src_td_window_scale; + u32 src_td_max_window; + u32 src_td_end; + u32 src_td_max_end; + u8 dest_td_window_scale; + u32 dest_td_max_window; + u32 dest_td_end; + u32 dest_td_max_end; + u32 mark; +#ifdef CONFIG_XFRM + u32 original_accel; + u32 reply_accel; +#endif + u32 src_priority; + u32 dest_priority; + u32 src_dscp; + u32 dest_dscp; +}; + +/* + * connection destruction structure. 
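+ * The connection manager fills this in from the conntrack ORIGINAL
+ * tuple when a destroy event arrives (see sfe_cm_conntrack_event()).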
+ */ +struct sfe_connection_destroy { + int protocol; + sfe_ip_addr_t src_ip; + sfe_ip_addr_t dest_ip; + __be16 src_port; + __be16 dest_port; +}; + +typedef enum sfe_sync_reason { + SFE_SYNC_REASON_STATS, /* Sync is to synchronize stats */ + SFE_SYNC_REASON_FLUSH, /* Sync is to flush a entry */ + SFE_SYNC_REASON_DESTROY /* Sync is to destroy a entry(requested by connection manager) */ +} sfe_sync_reason_t; + +/* + * Structure used to sync connection stats/state back within the system. + * + * NOTE: The addresses here are NON-NAT addresses, i.e. the true endpoint addressing. + * 'src' is the creator of the connection. + */ +struct sfe_connection_sync { + struct net_device *src_dev; + struct net_device *dest_dev; + int is_v6; /* Is it for ipv6? */ + int protocol; /* IP protocol number (IPPROTO_...) */ + sfe_ip_addr_t src_ip; /* Non-NAT source address, i.e. the creator of the connection */ + sfe_ip_addr_t src_ip_xlate; /* NATed source address */ + __be16 src_port; /* Non-NAT source port */ + __be16 src_port_xlate; /* NATed source port */ + sfe_ip_addr_t dest_ip; /* Non-NAT destination address, i.e. to whom the connection was created */ + sfe_ip_addr_t dest_ip_xlate; /* NATed destination address */ + __be16 dest_port; /* Non-NAT destination port */ + __be16 dest_port_xlate; /* NATed destination port */ + u32 src_td_max_window; + u32 src_td_end; + u32 src_td_max_end; + u64 src_packet_count; + u64 src_byte_count; + u32 src_new_packet_count; + u32 src_new_byte_count; + u32 dest_td_max_window; + u32 dest_td_end; + u32 dest_td_max_end; + u64 dest_packet_count; + u64 dest_byte_count; + u32 dest_new_packet_count; + u32 dest_new_byte_count; + u32 reason; /* reason for stats sync message, i.e. destroy, flush, period sync */ + u64 delta_jiffies; /* Time to be added to the current timeout to keep the connection alive */ +}; + +/* + * connection mark structure + */ +struct sfe_connection_mark { + int protocol; + sfe_ip_addr_t src_ip; + sfe_ip_addr_t dest_ip; + __be16 src_port; + __be16 dest_port; + u32 mark; +}; + +/* + * Expose the hook for the receive processing. + */ +extern int (*athrs_fast_nat_recv)(struct sk_buff *skb); + +/* + * Expose what should be a static flag in the TCP connection tracker. + */ +extern int nf_ct_tcp_no_window_check; + +/* + * This callback will be called in a timer + * at 100 times per second to sync stats back to + * Linux connection track. + * + * A RCU lock is taken to prevent this callback + * from unregistering. 
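+ *
+ * A minimal sketch of such a callback (illustrative only; my_sync_cb is
+ * a hypothetical name):
+ *
+ *   static void my_sync_cb(struct sfe_connection_sync *sis)
+ *   {
+ *           ... push sis->src_new_packet_count and friends back into
+ *               the conntrack entry, as sfe_cm_sync_rule() does ...
+ *   }
+ *
+ *   sfe_ipv4_register_sync_rule_callback(my_sync_cb);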
+ */ +typedef void (*sfe_sync_rule_callback_t)(struct sfe_connection_sync *); + +/* + * IPv4 APIs used by connection manager + */ +int sfe_ipv4_recv(struct net_device *dev, struct sk_buff *skb); +int sfe_ipv4_create_rule(struct sfe_connection_create *sic); +void sfe_ipv4_destroy_rule(struct sfe_connection_destroy *sid); +void sfe_ipv4_destroy_all_rules_for_dev(struct net_device *dev); +void sfe_ipv4_register_sync_rule_callback(sfe_sync_rule_callback_t callback); +void sfe_ipv4_update_rule(struct sfe_connection_create *sic); +void sfe_ipv4_mark_rule(struct sfe_connection_mark *mark); + +#ifdef SFE_SUPPORT_IPV6 +/* + * IPv6 APIs used by connection manager + */ +int sfe_ipv6_recv(struct net_device *dev, struct sk_buff *skb); +int sfe_ipv6_create_rule(struct sfe_connection_create *sic); +void sfe_ipv6_destroy_rule(struct sfe_connection_destroy *sid); +void sfe_ipv6_destroy_all_rules_for_dev(struct net_device *dev); +void sfe_ipv6_register_sync_rule_callback(sfe_sync_rule_callback_t callback); +void sfe_ipv6_update_rule(struct sfe_connection_create *sic); +void sfe_ipv6_mark_rule(struct sfe_connection_mark *mark); +#else +static inline int sfe_ipv6_recv(struct net_device *dev, struct sk_buff *skb) +{ + return 0; +} + +static inline int sfe_ipv6_create_rule(struct sfe_connection_create *sic) +{ + return 0; +} + +static inline void sfe_ipv6_destroy_rule(struct sfe_connection_destroy *sid) +{ + return; +} + +static inline void sfe_ipv6_destroy_all_rules_for_dev(struct net_device *dev) +{ + return; +} + +static inline void sfe_ipv6_register_sync_rule_callback(sfe_sync_rule_callback_t callback) +{ + return; +} + +static inline void sfe_ipv6_update_rule(struct sfe_connection_create *sic) +{ + return; +} + +static inline void sfe_ipv6_mark_rule(struct sfe_connection_mark *mark) +{ + return; +} +#endif + +/* + * sfe_ipv6_addr_equal() + * compare ipv6 address + * + * return: 1, equal; 0, no equal + */ +static inline int sfe_ipv6_addr_equal(struct sfe_ipv6_addr *a, + struct sfe_ipv6_addr *b) +{ + return a->addr[0] == b->addr[0] && + a->addr[1] == b->addr[1] && + a->addr[2] == b->addr[2] && + a->addr[3] == b->addr[3]; +} + +/* + * sfe_ipv4_addr_equal() + * compare ipv4 address + * + * return: 1, equal; 0, no equal + */ +#define sfe_ipv4_addr_equal(a, b) ((u32)(a) == (u32)(b)) + +/* + * sfe_addr_equal() + * compare ipv4 or ipv6 address + * + * return: 1, equal; 0, no equal + */ +static inline int sfe_addr_equal(sfe_ip_addr_t *a, + sfe_ip_addr_t *b, int is_v4) +{ + return is_v4 ? sfe_ipv4_addr_equal(a->ip, b->ip) : sfe_ipv6_addr_equal(a->ip6, b->ip6); +} diff --git a/shortcut-fe/sfe_ipv4.c b/shortcut-fe/sfe_ipv4.c new file mode 100644 index 000000000..9f7ebd1c9 --- /dev/null +++ b/shortcut-fe/sfe_ipv4.c @@ -0,0 +1,3610 @@ +/* + * sfe_ipv4.c + * Shortcut forwarding engine - IPv4 edition. + * + * Copyright (c) 2013-2016, 2019-2020 The Linux Foundation. All rights reserved. + * Permission to use, copy, modify, and/or distribute this software for + * any purpose with or without fee is hereby granted, provided that the + * above copyright notice and this permission notice appear in all copies. + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT + * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "sfe.h" +#include "sfe_cm.h" + +/* + * By default Linux IP header and transport layer header structures are + * unpacked, assuming that such headers should be 32-bit aligned. + * Unfortunately some wireless adaptors can't cope with this requirement and + * some CPUs can't handle misaligned accesses. For those platforms we + * define SFE_IPV4_UNALIGNED_IP_HEADER and mark the structures as packed. + * When we do this the compiler will generate slightly worse code than for the + * aligned case (on most platforms) but will be much quicker than fixing + * things up in an unaligned trap handler. + */ +#define SFE_IPV4_UNALIGNED_IP_HEADER 1 +#if SFE_IPV4_UNALIGNED_IP_HEADER +#define SFE_IPV4_UNALIGNED_STRUCT __attribute__((packed)) +#else +#define SFE_IPV4_UNALIGNED_STRUCT +#endif + +/* + * An Ethernet header, but with an optional "packed" attribute to + * help with performance on some platforms (see the definition of + * SFE_IPV4_UNALIGNED_STRUCT) + */ +struct sfe_ipv4_eth_hdr { + __be16 h_dest[ETH_ALEN / 2]; + __be16 h_source[ETH_ALEN / 2]; + __be16 h_proto; +} SFE_IPV4_UNALIGNED_STRUCT; + +#define SFE_IPV4_DSCP_MASK 0x3 +#define SFE_IPV4_DSCP_SHIFT 2 + +/* + * An IPv4 header, but with an optional "packed" attribute to + * help with performance on some platforms (see the definition of + * SFE_IPV4_UNALIGNED_STRUCT) + */ +struct sfe_ipv4_ip_hdr { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 ihl:4, + version:4; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u8 version:4, + ihl:4; +#else +#error "Please fix " +#endif + __u8 tos; + __be16 tot_len; + __be16 id; + __be16 frag_off; + __u8 ttl; + __u8 protocol; + __sum16 check; + __be32 saddr; + __be32 daddr; + + /* + * The options start here. + */ +} SFE_IPV4_UNALIGNED_STRUCT; + +/* + * A UDP header, but with an optional "packed" attribute to + * help with performance on some platforms (see the definition of + * SFE_IPV4_UNALIGNED_STRUCT) + */ +struct sfe_ipv4_udp_hdr { + __be16 source; + __be16 dest; + __be16 len; + __sum16 check; +} SFE_IPV4_UNALIGNED_STRUCT; + +/* + * A TCP header, but with an optional "packed" attribute to + * help with performance on some platforms (see the definition of + * SFE_IPV4_UNALIGNED_STRUCT) + */ +struct sfe_ipv4_tcp_hdr { + __be16 source; + __be16 dest; + __be32 seq; + __be32 ack_seq; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u16 res1:4, + doff:4, + fin:1, + syn:1, + rst:1, + psh:1, + ack:1, + urg:1, + ece:1, + cwr:1; +#elif defined(__BIG_ENDIAN_BITFIELD) + __u16 doff:4, + res1:4, + cwr:1, + ece:1, + urg:1, + ack:1, + psh:1, + rst:1, + syn:1, + fin:1; +#else +#error "Adjust your defines" +#endif + __be16 window; + __sum16 check; + __be16 urg_ptr; +} SFE_IPV4_UNALIGNED_STRUCT; + +/* + * Specifies the lower bound on ACK numbers carried in the TCP header + */ +#define SFE_IPV4_TCP_MAX_ACK_WINDOW 65520 + +/* + * IPv4 TCP connection match additional data. 
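+ * These mirror the td_window_scale/td_max_window/td_end/td_max_end
+ * values that the connection manager copies out of conntrack's
+ * ct->proto.tcp.seen[] state when the rule is created.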
+ */ +struct sfe_ipv4_tcp_connection_match { + u8 win_scale; /* Window scale */ + u32 max_win; /* Maximum window size seen */ + u32 end; /* Sequence number of the next byte to send (seq + segment length) */ + u32 max_end; /* Sequence number of the last byte to ack */ +}; + +/* + * Bit flags for IPv4 connection matching entry. + */ +#define SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_SRC (1<<0) + /* Perform source translation */ +#define SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_DEST (1<<1) + /* Perform destination translation */ +#define SFE_IPV4_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK (1<<2) + /* Ignore TCP sequence numbers */ +#define SFE_IPV4_CONNECTION_MATCH_FLAG_WRITE_FAST_ETH_HDR (1<<3) + /* Fast Ethernet header write */ +#define SFE_IPV4_CONNECTION_MATCH_FLAG_WRITE_L2_HDR (1<<4) + /* Fast Ethernet header write */ +#define SFE_IPV4_CONNECTION_MATCH_FLAG_PRIORITY_REMARK (1<<5) + /* remark priority of SKB */ +#define SFE_IPV4_CONNECTION_MATCH_FLAG_DSCP_REMARK (1<<6) + /* remark DSCP of packet */ + +/* + * IPv4 connection matching structure. + */ +struct sfe_ipv4_connection_match { + /* + * References to other objects. + */ + struct sfe_ipv4_connection_match *next; + struct sfe_ipv4_connection_match *prev; + struct sfe_ipv4_connection *connection; + struct sfe_ipv4_connection_match *counter_match; + /* Matches the flow in the opposite direction as the one in *connection */ + struct sfe_ipv4_connection_match *active_next; + struct sfe_ipv4_connection_match *active_prev; + bool active; /* Flag to indicate if we're on the active list */ + + /* + * Characteristics that identify flows that match this rule. + */ + struct net_device *match_dev; /* Network device */ + u8 match_protocol; /* Protocol */ + __be32 match_src_ip; /* Source IP address */ + __be32 match_dest_ip; /* Destination IP address */ + __be16 match_src_port; /* Source port/connection ident */ + __be16 match_dest_port; /* Destination port/connection ident */ + + /* + * Control the operations of the match. + */ + u32 flags; /* Bit flags */ +#ifdef CONFIG_NF_FLOW_COOKIE + u32 flow_cookie; /* used flow cookie, for debug */ +#endif +#ifdef CONFIG_XFRM + u32 flow_accel; /* The flow accelerated or not */ +#endif + + /* + * Connection state that we track once we match. + */ + union { /* Protocol-specific state */ + struct sfe_ipv4_tcp_connection_match tcp; + } protocol_state; + /* + * Stats recorded in a sync period. These stats will be added to + * rx_packet_count64/rx_byte_count64 after a sync period. + */ + u32 rx_packet_count; + u32 rx_byte_count; + + /* + * Packet translation information. + */ + __be32 xlate_src_ip; /* Address after source translation */ + __be16 xlate_src_port; /* Port/connection ident after source translation */ + u16 xlate_src_csum_adjustment; + /* Transport layer checksum adjustment after source translation */ + u16 xlate_src_partial_csum_adjustment; + /* Transport layer pseudo header checksum adjustment after source translation */ + + __be32 xlate_dest_ip; /* Address after destination translation */ + __be16 xlate_dest_port; /* Port/connection ident after destination translation */ + u16 xlate_dest_csum_adjustment; + /* Transport layer checksum adjustment after destination translation */ + u16 xlate_dest_partial_csum_adjustment; + /* Transport layer pseudo header checksum adjustment after destination translation */ + + /* + * QoS information + */ + u32 priority; + u32 dscp; + + /* + * Packet transmit information. 
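+ * The MAC addresses are stored as three 16-bit halves (u16[ETH_ALEN / 2]),
+ * presumably so the Ethernet header can be rewritten with halfword
+ * stores on the fast path (cf. the WRITE_FAST_ETH_HDR flag above).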
+ */ + struct net_device *xmit_dev; /* Network device on which to transmit */ + unsigned short int xmit_dev_mtu; + /* Interface MTU */ + u16 xmit_dest_mac[ETH_ALEN / 2]; + /* Destination MAC address to use when forwarding */ + u16 xmit_src_mac[ETH_ALEN / 2]; + /* Source MAC address to use when forwarding */ + + /* + * Summary stats. + */ + u64 rx_packet_count64; + u64 rx_byte_count64; +}; + +/* + * Per-connection data structure. + */ +struct sfe_ipv4_connection { + struct sfe_ipv4_connection *next; + /* Pointer to the next entry in a hash chain */ + struct sfe_ipv4_connection *prev; + /* Pointer to the previous entry in a hash chain */ + int protocol; /* IP protocol number */ + __be32 src_ip; /* Src IP addr pre-translation */ + __be32 src_ip_xlate; /* Src IP addr post-translation */ + __be32 dest_ip; /* Dest IP addr pre-translation */ + __be32 dest_ip_xlate; /* Dest IP addr post-translation */ + __be16 src_port; /* Src port pre-translation */ + __be16 src_port_xlate; /* Src port post-translation */ + __be16 dest_port; /* Dest port pre-translation */ + __be16 dest_port_xlate; /* Dest port post-translation */ + struct sfe_ipv4_connection_match *original_match; + /* Original direction matching structure */ + struct net_device *original_dev; + /* Original direction source device */ + struct sfe_ipv4_connection_match *reply_match; + /* Reply direction matching structure */ + struct net_device *reply_dev; /* Reply direction source device */ + u64 last_sync_jiffies; /* Jiffies count for the last sync */ + struct sfe_ipv4_connection *all_connections_next; + /* Pointer to the next entry in the list of all connections */ + struct sfe_ipv4_connection *all_connections_prev; + /* Pointer to the previous entry in the list of all connections */ + u32 mark; /* mark for outgoing packet */ + u32 debug_read_seq; /* sequence number for debug dump */ +}; + +/* + * IPv4 connections and hash table size information. 
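+ *
+ * With a shift of 12 this works out to 4096 (1 << 12) buckets and a mask of
+ * 0xfff; both the connection and connection-match hash tables use this size.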
+ */ +#define SFE_IPV4_CONNECTION_HASH_SHIFT 12 +#define SFE_IPV4_CONNECTION_HASH_SIZE (1 << SFE_IPV4_CONNECTION_HASH_SHIFT) +#define SFE_IPV4_CONNECTION_HASH_MASK (SFE_IPV4_CONNECTION_HASH_SIZE - 1) + +#ifdef CONFIG_NF_FLOW_COOKIE +#define SFE_FLOW_COOKIE_SIZE 2048 +#define SFE_FLOW_COOKIE_MASK 0x7ff + +struct sfe_flow_cookie_entry { + struct sfe_ipv4_connection_match *match; + unsigned long last_clean_time; +}; +#endif + +enum sfe_ipv4_exception_events { + SFE_IPV4_EXCEPTION_EVENT_UDP_HEADER_INCOMPLETE, + SFE_IPV4_EXCEPTION_EVENT_UDP_NO_CONNECTION, + SFE_IPV4_EXCEPTION_EVENT_UDP_IP_OPTIONS_OR_INITIAL_FRAGMENT, + SFE_IPV4_EXCEPTION_EVENT_UDP_SMALL_TTL, + SFE_IPV4_EXCEPTION_EVENT_UDP_NEEDS_FRAGMENTATION, + SFE_IPV4_EXCEPTION_EVENT_TCP_HEADER_INCOMPLETE, + SFE_IPV4_EXCEPTION_EVENT_TCP_NO_CONNECTION_SLOW_FLAGS, + SFE_IPV4_EXCEPTION_EVENT_TCP_NO_CONNECTION_FAST_FLAGS, + SFE_IPV4_EXCEPTION_EVENT_TCP_IP_OPTIONS_OR_INITIAL_FRAGMENT, + SFE_IPV4_EXCEPTION_EVENT_TCP_SMALL_TTL, + SFE_IPV4_EXCEPTION_EVENT_TCP_NEEDS_FRAGMENTATION, + SFE_IPV4_EXCEPTION_EVENT_TCP_FLAGS, + SFE_IPV4_EXCEPTION_EVENT_TCP_SEQ_EXCEEDS_RIGHT_EDGE, + SFE_IPV4_EXCEPTION_EVENT_TCP_SMALL_DATA_OFFS, + SFE_IPV4_EXCEPTION_EVENT_TCP_BAD_SACK, + SFE_IPV4_EXCEPTION_EVENT_TCP_BIG_DATA_OFFS, + SFE_IPV4_EXCEPTION_EVENT_TCP_SEQ_BEFORE_LEFT_EDGE, + SFE_IPV4_EXCEPTION_EVENT_TCP_ACK_EXCEEDS_RIGHT_EDGE, + SFE_IPV4_EXCEPTION_EVENT_TCP_ACK_BEFORE_LEFT_EDGE, + SFE_IPV4_EXCEPTION_EVENT_ICMP_HEADER_INCOMPLETE, + SFE_IPV4_EXCEPTION_EVENT_ICMP_UNHANDLED_TYPE, + SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_HEADER_INCOMPLETE, + SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_NON_V4, + SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_IP_OPTIONS_INCOMPLETE, + SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_UDP_HEADER_INCOMPLETE, + SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_TCP_HEADER_INCOMPLETE, + SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_UNHANDLED_PROTOCOL, + SFE_IPV4_EXCEPTION_EVENT_ICMP_NO_CONNECTION, + SFE_IPV4_EXCEPTION_EVENT_ICMP_FLUSHED_CONNECTION, + SFE_IPV4_EXCEPTION_EVENT_HEADER_INCOMPLETE, + SFE_IPV4_EXCEPTION_EVENT_BAD_TOTAL_LENGTH, + SFE_IPV4_EXCEPTION_EVENT_NON_V4, + SFE_IPV4_EXCEPTION_EVENT_NON_INITIAL_FRAGMENT, + SFE_IPV4_EXCEPTION_EVENT_DATAGRAM_INCOMPLETE, + SFE_IPV4_EXCEPTION_EVENT_IP_OPTIONS_INCOMPLETE, + SFE_IPV4_EXCEPTION_EVENT_UNHANDLED_PROTOCOL, + SFE_IPV4_EXCEPTION_EVENT_LAST +}; + +static char *sfe_ipv4_exception_events_string[SFE_IPV4_EXCEPTION_EVENT_LAST] = { + "UDP_HEADER_INCOMPLETE", + "UDP_NO_CONNECTION", + "UDP_IP_OPTIONS_OR_INITIAL_FRAGMENT", + "UDP_SMALL_TTL", + "UDP_NEEDS_FRAGMENTATION", + "TCP_HEADER_INCOMPLETE", + "TCP_NO_CONNECTION_SLOW_FLAGS", + "TCP_NO_CONNECTION_FAST_FLAGS", + "TCP_IP_OPTIONS_OR_INITIAL_FRAGMENT", + "TCP_SMALL_TTL", + "TCP_NEEDS_FRAGMENTATION", + "TCP_FLAGS", + "TCP_SEQ_EXCEEDS_RIGHT_EDGE", + "TCP_SMALL_DATA_OFFS", + "TCP_BAD_SACK", + "TCP_BIG_DATA_OFFS", + "TCP_SEQ_BEFORE_LEFT_EDGE", + "TCP_ACK_EXCEEDS_RIGHT_EDGE", + "TCP_ACK_BEFORE_LEFT_EDGE", + "ICMP_HEADER_INCOMPLETE", + "ICMP_UNHANDLED_TYPE", + "ICMP_IPV4_HEADER_INCOMPLETE", + "ICMP_IPV4_NON_V4", + "ICMP_IPV4_IP_OPTIONS_INCOMPLETE", + "ICMP_IPV4_UDP_HEADER_INCOMPLETE", + "ICMP_IPV4_TCP_HEADER_INCOMPLETE", + "ICMP_IPV4_UNHANDLED_PROTOCOL", + "ICMP_NO_CONNECTION", + "ICMP_FLUSHED_CONNECTION", + "HEADER_INCOMPLETE", + "BAD_TOTAL_LENGTH", + "NON_V4", + "NON_INITIAL_FRAGMENT", + "DATAGRAM_INCOMPLETE", + "IP_OPTIONS_INCOMPLETE", + "UNHANDLED_PROTOCOL" +}; + +/* + * Per-module structure. 
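+ *
+ * A single instance of this (__si below) holds all IPv4 state.  The spinlock
+ * protects the hash tables, the lists and the 32-bit delta counters, which
+ * sfe_ipv4_update_summary_stats() folds into the 64-bit totals.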
+ */ +struct sfe_ipv4 { + spinlock_t lock; /* Lock for SMP correctness */ + struct sfe_ipv4_connection_match *active_head; + /* Head of the list of recently active connections */ + struct sfe_ipv4_connection_match *active_tail; + /* Tail of the list of recently active connections */ + struct sfe_ipv4_connection *all_connections_head; + /* Head of the list of all connections */ + struct sfe_ipv4_connection *all_connections_tail; + /* Tail of the list of all connections */ + unsigned int num_connections; /* Number of connections */ + struct timer_list timer; /* Timer used for periodic sync ops */ + sfe_sync_rule_callback_t __rcu sync_rule_callback; + /* Callback function registered by a connection manager for stats syncing */ + struct sfe_ipv4_connection *conn_hash[SFE_IPV4_CONNECTION_HASH_SIZE]; + /* Connection hash table */ + struct sfe_ipv4_connection_match *conn_match_hash[SFE_IPV4_CONNECTION_HASH_SIZE]; + /* Connection match hash table */ +#ifdef CONFIG_NF_FLOW_COOKIE + struct sfe_flow_cookie_entry sfe_flow_cookie_table[SFE_FLOW_COOKIE_SIZE]; + /* flow cookie table*/ + flow_cookie_set_func_t flow_cookie_set_func; + /* function used to configure flow cookie in hardware*/ + int flow_cookie_enable; + /* Enable/disable flow cookie at runtime */ +#endif + + /* + * Stats recorded in a sync period. These stats will be added to + * connection_xxx64 after a sync period. + */ + u32 connection_create_requests; + /* Number of IPv4 connection create requests */ + u32 connection_create_collisions; + /* Number of IPv4 connection create requests that collided with existing hash table entries */ + u32 connection_destroy_requests; + /* Number of IPv4 connection destroy requests */ + u32 connection_destroy_misses; + /* Number of IPv4 connection destroy requests that missed our hash table */ + u32 connection_match_hash_hits; + /* Number of IPv4 connection match hash hits */ + u32 connection_match_hash_reorders; + /* Number of IPv4 connection match hash reorders */ + u32 connection_flushes; /* Number of IPv4 connection flushes */ + u32 packets_forwarded; /* Number of IPv4 packets forwarded */ + u32 packets_not_forwarded; /* Number of IPv4 packets not forwarded */ + u32 exception_events[SFE_IPV4_EXCEPTION_EVENT_LAST]; + + /* + * Summary statistics. + */ + u64 connection_create_requests64; + /* Number of IPv4 connection create requests */ + u64 connection_create_collisions64; + /* Number of IPv4 connection create requests that collided with existing hash table entries */ + u64 connection_destroy_requests64; + /* Number of IPv4 connection destroy requests */ + u64 connection_destroy_misses64; + /* Number of IPv4 connection destroy requests that missed our hash table */ + u64 connection_match_hash_hits64; + /* Number of IPv4 connection match hash hits */ + u64 connection_match_hash_reorders64; + /* Number of IPv4 connection match hash reorders */ + u64 connection_flushes64; /* Number of IPv4 connection flushes */ + u64 packets_forwarded64; /* Number of IPv4 packets forwarded */ + u64 packets_not_forwarded64; + /* Number of IPv4 packets not forwarded */ + u64 exception_events64[SFE_IPV4_EXCEPTION_EVENT_LAST]; + + /* + * Control state. + */ + struct kobject *sys_sfe_ipv4; /* sysfs linkage */ + int debug_dev; /* Major number of the debug char device */ + u32 debug_read_seq; /* sequence number for debug dump */ +}; + +/* + * Enumeration of the XML output. 
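+ *
+ * These are the states of the debug character device's dump state machine:
+ * connections first, then the exception counters, then the aggregate stats,
+ * all emitted as XML elements.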
+ */ +enum sfe_ipv4_debug_xml_states { + SFE_IPV4_DEBUG_XML_STATE_START, + SFE_IPV4_DEBUG_XML_STATE_CONNECTIONS_START, + SFE_IPV4_DEBUG_XML_STATE_CONNECTIONS_CONNECTION, + SFE_IPV4_DEBUG_XML_STATE_CONNECTIONS_END, + SFE_IPV4_DEBUG_XML_STATE_EXCEPTIONS_START, + SFE_IPV4_DEBUG_XML_STATE_EXCEPTIONS_EXCEPTION, + SFE_IPV4_DEBUG_XML_STATE_EXCEPTIONS_END, + SFE_IPV4_DEBUG_XML_STATE_STATS, + SFE_IPV4_DEBUG_XML_STATE_END, + SFE_IPV4_DEBUG_XML_STATE_DONE +}; + +/* + * XML write state. + */ +struct sfe_ipv4_debug_xml_write_state { + enum sfe_ipv4_debug_xml_states state; + /* XML output file state machine state */ + int iter_exception; /* Next exception iterator */ +}; + +typedef bool (*sfe_ipv4_debug_xml_write_method_t)(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length, + int *total_read, struct sfe_ipv4_debug_xml_write_state *ws); + +static struct sfe_ipv4 __si; + +/* + * sfe_ipv4_gen_ip_csum() + * Generate the IP checksum for an IPv4 header. + * + * Note that this function assumes that we have only 20 bytes of IP header. + */ +static inline u16 sfe_ipv4_gen_ip_csum(struct sfe_ipv4_ip_hdr *iph) +{ + u32 sum; + u16 *i = (u16 *)iph; + + iph->check = 0; + + /* + * Generate the sum. + */ + sum = i[0] + i[1] + i[2] + i[3] + i[4] + i[5] + i[6] + i[7] + i[8] + i[9]; + + /* + * Fold it to ones-complement form. + */ + sum = (sum & 0xffff) + (sum >> 16); + sum = (sum & 0xffff) + (sum >> 16); + + return (u16)sum ^ 0xffff; +} + +/* + * sfe_ipv4_get_connection_match_hash() + * Generate the hash used in connection match lookups. + */ +static inline unsigned int sfe_ipv4_get_connection_match_hash(struct net_device *dev, u8 protocol, + __be32 src_ip, __be16 src_port, + __be32 dest_ip, __be16 dest_port) +{ + size_t dev_addr = (size_t)dev; + u32 hash = ((u32)dev_addr) ^ ntohl(src_ip ^ dest_ip) ^ protocol ^ ntohs(src_port ^ dest_port); + return ((hash >> SFE_IPV4_CONNECTION_HASH_SHIFT) ^ hash) & SFE_IPV4_CONNECTION_HASH_MASK; +} + +/* + * sfe_ipv4_find_sfe_ipv4_connection_match() + * Get the IPv4 flow match info that corresponds to a particular 5-tuple. + * + * On entry we must be holding the lock that protects the hash table. + */ +static struct sfe_ipv4_connection_match * +sfe_ipv4_find_sfe_ipv4_connection_match(struct sfe_ipv4 *si, struct net_device *dev, u8 protocol, + __be32 src_ip, __be16 src_port, + __be32 dest_ip, __be16 dest_port) +{ + struct sfe_ipv4_connection_match *cm; + struct sfe_ipv4_connection_match *head; + unsigned int conn_match_idx; + + conn_match_idx = sfe_ipv4_get_connection_match_hash(dev, protocol, src_ip, src_port, dest_ip, dest_port); + cm = si->conn_match_hash[conn_match_idx]; + + /* + * If we don't have anything in this chain then bail. + */ + if (unlikely(!cm)) { + return NULL; + } + + /* + * Hopefully the first entry is the one we want. + */ + if ((cm->match_src_port == src_port) + && (cm->match_dest_port == dest_port) + && (cm->match_src_ip == src_ip) + && (cm->match_dest_ip == dest_ip) + && (cm->match_protocol == protocol) + && (cm->match_dev == dev)) { + si->connection_match_hash_hits++; + return cm; + } + + /* + * Unfortunately we didn't find it at head, so we search it in chain and + * move matching entry to the top of the hash chain. We presume that this + * will be reused again very quickly. 
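+	 * This is a simple move-to-front heuristic; every such promotion is
+	 * counted in connection_match_hash_reorders.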
+ */ + head = cm; + do { + cm = cm->next; + } while (cm && (cm->match_src_port != src_port + || cm->match_dest_port != dest_port + || cm->match_src_ip != src_ip + || cm->match_dest_ip != dest_ip + || cm->match_protocol != protocol + || cm->match_dev != dev)); + + /* + * Not found then we're done. + */ + if (unlikely(!cm)) { + return NULL; + } + + /* + * We found a match so move it. + */ + if (cm->next) { + cm->next->prev = cm->prev; + } + cm->prev->next = cm->next; + cm->prev = NULL; + cm->next = head; + head->prev = cm; + si->conn_match_hash[conn_match_idx] = cm; + si->connection_match_hash_reorders++; + + return cm; +} + +/* + * sfe_ipv4_connection_match_update_summary_stats() + * Update the summary stats for a connection match entry. + */ +static inline void sfe_ipv4_connection_match_update_summary_stats(struct sfe_ipv4_connection_match *cm) +{ + cm->rx_packet_count64 += cm->rx_packet_count; + cm->rx_packet_count = 0; + cm->rx_byte_count64 += cm->rx_byte_count; + cm->rx_byte_count = 0; +} + +/* + * sfe_ipv4_connection_match_compute_translations() + * Compute port and address translations for a connection match entry. + */ +static void sfe_ipv4_connection_match_compute_translations(struct sfe_ipv4_connection_match *cm) +{ + /* + * Before we insert the entry look to see if this is tagged as doing address + * translations. If it is then work out the adjustment that we need to apply + * to the transport checksum. + */ + if (cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_SRC) { + /* + * Precompute an incremental checksum adjustment so we can + * edit packets in this stream very quickly. The algorithm is from RFC1624. + */ + u16 src_ip_hi = cm->match_src_ip >> 16; + u16 src_ip_lo = cm->match_src_ip & 0xffff; + u32 xlate_src_ip = ~cm->xlate_src_ip; + u16 xlate_src_ip_hi = xlate_src_ip >> 16; + u16 xlate_src_ip_lo = xlate_src_ip & 0xffff; + u16 xlate_src_port = ~cm->xlate_src_port; + u32 adj; + + /* + * When we compute this fold it down to a 16-bit offset + * as that way we can avoid having to do a double + * folding of the twos-complement result because the + * addition of 2 16-bit values cannot cause a double + * wrap-around! + */ + adj = src_ip_hi + src_ip_lo + cm->match_src_port + + xlate_src_ip_hi + xlate_src_ip_lo + xlate_src_port; + adj = (adj & 0xffff) + (adj >> 16); + adj = (adj & 0xffff) + (adj >> 16); + cm->xlate_src_csum_adjustment = (u16)adj; + + } + + if (cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_DEST) { + /* + * Precompute an incremental checksum adjustment so we can + * edit packets in this stream very quickly. The algorithm is from RFC1624. + */ + u16 dest_ip_hi = cm->match_dest_ip >> 16; + u16 dest_ip_lo = cm->match_dest_ip & 0xffff; + u32 xlate_dest_ip = ~cm->xlate_dest_ip; + u16 xlate_dest_ip_hi = xlate_dest_ip >> 16; + u16 xlate_dest_ip_lo = xlate_dest_ip & 0xffff; + u16 xlate_dest_port = ~cm->xlate_dest_port; + u32 adj; + + /* + * When we compute this fold it down to a 16-bit offset + * as that way we can avoid having to do a double + * folding of the twos-complement result because the + * addition of 2 16-bit values cannot cause a double + * wrap-around! 
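+		 *
+		 * In RFC 1624 terms this precomputes (m + ~m') folded to 16 bits,
+		 * where m is the pre-translation data and m' the post-translation
+		 * data, so the receive path only has to do
+		 * checksum' = fold(checksum + adjustment) per packet.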
+ */ + adj = dest_ip_hi + dest_ip_lo + cm->match_dest_port + + xlate_dest_ip_hi + xlate_dest_ip_lo + xlate_dest_port; + adj = (adj & 0xffff) + (adj >> 16); + adj = (adj & 0xffff) + (adj >> 16); + cm->xlate_dest_csum_adjustment = (u16)adj; + } + + if (cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_SRC) { + u32 adj = ~cm->match_src_ip + cm->xlate_src_ip; + if (adj < cm->xlate_src_ip) { + adj++; + } + + adj = (adj & 0xffff) + (adj >> 16); + adj = (adj & 0xffff) + (adj >> 16); + cm->xlate_src_partial_csum_adjustment = (u16)adj; + } + + if (cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_DEST) { + u32 adj = ~cm->match_dest_ip + cm->xlate_dest_ip; + if (adj < cm->xlate_dest_ip) { + adj++; + } + + adj = (adj & 0xffff) + (adj >> 16); + adj = (adj & 0xffff) + (adj >> 16); + cm->xlate_dest_partial_csum_adjustment = (u16)adj; + } + +} + +/* + * sfe_ipv4_update_summary_stats() + * Update the summary stats. + */ +static void sfe_ipv4_update_summary_stats(struct sfe_ipv4 *si) +{ + int i; + + si->connection_create_requests64 += si->connection_create_requests; + si->connection_create_requests = 0; + si->connection_create_collisions64 += si->connection_create_collisions; + si->connection_create_collisions = 0; + si->connection_destroy_requests64 += si->connection_destroy_requests; + si->connection_destroy_requests = 0; + si->connection_destroy_misses64 += si->connection_destroy_misses; + si->connection_destroy_misses = 0; + si->connection_match_hash_hits64 += si->connection_match_hash_hits; + si->connection_match_hash_hits = 0; + si->connection_match_hash_reorders64 += si->connection_match_hash_reorders; + si->connection_match_hash_reorders = 0; + si->connection_flushes64 += si->connection_flushes; + si->connection_flushes = 0; + si->packets_forwarded64 += si->packets_forwarded; + si->packets_forwarded = 0; + si->packets_not_forwarded64 += si->packets_not_forwarded; + si->packets_not_forwarded = 0; + + for (i = 0; i < SFE_IPV4_EXCEPTION_EVENT_LAST; i++) { + si->exception_events64[i] += si->exception_events[i]; + si->exception_events[i] = 0; + } +} + +/* + * sfe_ipv4_insert_sfe_ipv4_connection_match() + * Insert a connection match into the hash. + * + * On entry we must be holding the lock that protects the hash table. + */ +static inline void sfe_ipv4_insert_sfe_ipv4_connection_match(struct sfe_ipv4 *si, + struct sfe_ipv4_connection_match *cm) +{ + struct sfe_ipv4_connection_match **hash_head; + struct sfe_ipv4_connection_match *prev_head; + unsigned int conn_match_idx + = sfe_ipv4_get_connection_match_hash(cm->match_dev, cm->match_protocol, + cm->match_src_ip, cm->match_src_port, + cm->match_dest_ip, cm->match_dest_port); + + hash_head = &si->conn_match_hash[conn_match_idx]; + prev_head = *hash_head; + cm->prev = NULL; + if (prev_head) { + prev_head->prev = cm; + } + + cm->next = prev_head; + *hash_head = cm; + +#ifdef CONFIG_NF_FLOW_COOKIE + if (!si->flow_cookie_enable) + return; + + /* + * Configure hardware to put a flow cookie in packet of this flow, + * then we can accelerate the lookup process when we received this packet. 
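+	 * Index 0 is reserved to mean "no cookie", so the scan starts at 1 and
+	 * claims the first slot that is both free and has been idle for at least
+	 * a second; the receive path can then use skb->flow_cookie as a direct
+	 * index into sfe_flow_cookie_table.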
+ */ + for (conn_match_idx = 1; conn_match_idx < SFE_FLOW_COOKIE_SIZE; conn_match_idx++) { + struct sfe_flow_cookie_entry *entry = &si->sfe_flow_cookie_table[conn_match_idx]; + + if ((NULL == entry->match) && time_is_before_jiffies(entry->last_clean_time + HZ)) { + flow_cookie_set_func_t func; + + rcu_read_lock(); + func = rcu_dereference(si->flow_cookie_set_func); + if (func) { + if (!func(cm->match_protocol, cm->match_src_ip, cm->match_src_port, + cm->match_dest_ip, cm->match_dest_port, conn_match_idx)) { + entry->match = cm; + cm->flow_cookie = conn_match_idx; + } + } + rcu_read_unlock(); + + break; + } + } +#endif +} + +/* + * sfe_ipv4_remove_sfe_ipv4_connection_match() + * Remove a connection match object from the hash. + * + * On entry we must be holding the lock that protects the hash table. + */ +static inline void sfe_ipv4_remove_sfe_ipv4_connection_match(struct sfe_ipv4 *si, struct sfe_ipv4_connection_match *cm) +{ +#ifdef CONFIG_NF_FLOW_COOKIE + if (si->flow_cookie_enable) { + /* + * Tell hardware that we no longer need a flow cookie in packet of this flow + */ + unsigned int conn_match_idx; + + for (conn_match_idx = 1; conn_match_idx < SFE_FLOW_COOKIE_SIZE; conn_match_idx++) { + struct sfe_flow_cookie_entry *entry = &si->sfe_flow_cookie_table[conn_match_idx]; + + if (cm == entry->match) { + flow_cookie_set_func_t func; + + rcu_read_lock(); + func = rcu_dereference(si->flow_cookie_set_func); + if (func) { + func(cm->match_protocol, cm->match_src_ip, cm->match_src_port, + cm->match_dest_ip, cm->match_dest_port, 0); + } + rcu_read_unlock(); + + cm->flow_cookie = 0; + entry->match = NULL; + entry->last_clean_time = jiffies; + break; + } + } + } +#endif + + /* + * Unlink the connection match entry from the hash. + */ + if (cm->prev) { + cm->prev->next = cm->next; + } else { + unsigned int conn_match_idx + = sfe_ipv4_get_connection_match_hash(cm->match_dev, cm->match_protocol, + cm->match_src_ip, cm->match_src_port, + cm->match_dest_ip, cm->match_dest_port); + si->conn_match_hash[conn_match_idx] = cm->next; + } + + if (cm->next) { + cm->next->prev = cm->prev; + } + + /* + * If the connection match entry is in the active list remove it. + */ + if (cm->active) { + if (likely(cm->active_prev)) { + cm->active_prev->active_next = cm->active_next; + } else { + si->active_head = cm->active_next; + } + + if (likely(cm->active_next)) { + cm->active_next->active_prev = cm->active_prev; + } else { + si->active_tail = cm->active_prev; + } + } +} + +/* + * sfe_ipv4_get_connection_hash() + * Generate the hash used in connection lookups. + */ +static inline unsigned int sfe_ipv4_get_connection_hash(u8 protocol, __be32 src_ip, __be16 src_port, + __be32 dest_ip, __be16 dest_port) +{ + u32 hash = ntohl(src_ip ^ dest_ip) ^ protocol ^ ntohs(src_port ^ dest_port); + return ((hash >> SFE_IPV4_CONNECTION_HASH_SHIFT) ^ hash) & SFE_IPV4_CONNECTION_HASH_MASK; +} + +/* + * sfe_ipv4_find_sfe_ipv4_connection() + * Get the IPv4 connection info that corresponds to a particular 5-tuple. + * + * On entry we must be holding the lock that protects the hash table. + */ +static inline struct sfe_ipv4_connection *sfe_ipv4_find_sfe_ipv4_connection(struct sfe_ipv4 *si, u32 protocol, + __be32 src_ip, __be16 src_port, + __be32 dest_ip, __be16 dest_port) +{ + struct sfe_ipv4_connection *c; + unsigned int conn_idx = sfe_ipv4_get_connection_hash(protocol, src_ip, src_port, dest_ip, dest_port); + c = si->conn_hash[conn_idx]; + + /* + * If we don't have anything in this chain then bale. 
+ */ + if (unlikely(!c)) { + return NULL; + } + + /* + * Hopefully the first entry is the one we want. + */ + if ((c->src_port == src_port) + && (c->dest_port == dest_port) + && (c->src_ip == src_ip) + && (c->dest_ip == dest_ip) + && (c->protocol == protocol)) { + return c; + } + + /* + * Unfortunately we didn't find it at head, so we search it in chain. + */ + do { + c = c->next; + } while (c && (c->src_port != src_port + || c->dest_port != dest_port + || c->src_ip != src_ip + || c->dest_ip != dest_ip + || c->protocol != protocol)); + + /* + * Will need connection entry for next create/destroy metadata, + * So no need to re-order entry for these requests + */ + return c; +} + +/* + * sfe_ipv4_mark_rule() + * Updates the mark for a current offloaded connection + * + * Will take hash lock upon entry + */ +void sfe_ipv4_mark_rule(struct sfe_connection_mark *mark) +{ + struct sfe_ipv4 *si = &__si; + struct sfe_ipv4_connection *c; + + spin_lock_bh(&si->lock); + c = sfe_ipv4_find_sfe_ipv4_connection(si, mark->protocol, + mark->src_ip.ip, mark->src_port, + mark->dest_ip.ip, mark->dest_port); + if (c) { + WARN_ON((0 != c->mark) && (0 == mark->mark)); + c->mark = mark->mark; + } + spin_unlock_bh(&si->lock); + + if (c) { + DEBUG_TRACE("Matching connection found for mark, " + "setting from %08x to %08x\n", + c->mark, mark->mark); + } +} + +/* + * sfe_ipv4_insert_sfe_ipv4_connection() + * Insert a connection into the hash. + * + * On entry we must be holding the lock that protects the hash table. + */ +static void sfe_ipv4_insert_sfe_ipv4_connection(struct sfe_ipv4 *si, struct sfe_ipv4_connection *c) +{ + struct sfe_ipv4_connection **hash_head; + struct sfe_ipv4_connection *prev_head; + unsigned int conn_idx; + + /* + * Insert entry into the connection hash. + */ + conn_idx = sfe_ipv4_get_connection_hash(c->protocol, c->src_ip, c->src_port, + c->dest_ip, c->dest_port); + hash_head = &si->conn_hash[conn_idx]; + prev_head = *hash_head; + c->prev = NULL; + if (prev_head) { + prev_head->prev = c; + } + + c->next = prev_head; + *hash_head = c; + + /* + * Insert entry into the "all connections" list. + */ + if (si->all_connections_tail) { + c->all_connections_prev = si->all_connections_tail; + si->all_connections_tail->all_connections_next = c; + } else { + c->all_connections_prev = NULL; + si->all_connections_head = c; + } + + si->all_connections_tail = c; + c->all_connections_next = NULL; + si->num_connections++; + + /* + * Insert the connection match objects too. + */ + sfe_ipv4_insert_sfe_ipv4_connection_match(si, c->original_match); + sfe_ipv4_insert_sfe_ipv4_connection_match(si, c->reply_match); +} + +/* + * sfe_ipv4_remove_sfe_ipv4_connection() + * Remove a sfe_ipv4_connection object from the hash. + * + * On entry we must be holding the lock that protects the hash table. + */ +static void sfe_ipv4_remove_sfe_ipv4_connection(struct sfe_ipv4 *si, struct sfe_ipv4_connection *c) +{ + /* + * Remove the connection match objects. + */ + sfe_ipv4_remove_sfe_ipv4_connection_match(si, c->reply_match); + sfe_ipv4_remove_sfe_ipv4_connection_match(si, c->original_match); + + /* + * Unlink the connection. 
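+	 * When the entry is the head of its hash chain there is no previous
+	 * entry to fix up, so the bucket index is recomputed in order to update
+	 * the chain head directly.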
+ */ + if (c->prev) { + c->prev->next = c->next; + } else { + unsigned int conn_idx = sfe_ipv4_get_connection_hash(c->protocol, c->src_ip, c->src_port, + c->dest_ip, c->dest_port); + si->conn_hash[conn_idx] = c->next; + } + + if (c->next) { + c->next->prev = c->prev; + } + + /* + * Unlink connection from all_connections list + */ + if (c->all_connections_prev) { + c->all_connections_prev->all_connections_next = c->all_connections_next; + } else { + si->all_connections_head = c->all_connections_next; + } + + if (c->all_connections_next) { + c->all_connections_next->all_connections_prev = c->all_connections_prev; + } else { + si->all_connections_tail = c->all_connections_prev; + } + + si->num_connections--; +} + +/* + * sfe_ipv4_sync_sfe_ipv4_connection() + * Sync a connection. + * + * On entry to this function we expect that the lock for the connection is either + * already held or isn't required. + */ +static void sfe_ipv4_gen_sync_sfe_ipv4_connection(struct sfe_ipv4 *si, struct sfe_ipv4_connection *c, + struct sfe_connection_sync *sis, sfe_sync_reason_t reason, + u64 now_jiffies) +{ + struct sfe_ipv4_connection_match *original_cm; + struct sfe_ipv4_connection_match *reply_cm; + + /* + * Fill in the update message. + */ + sis->is_v6 = 0; + sis->protocol = c->protocol; + sis->src_ip.ip = c->src_ip; + sis->src_ip_xlate.ip = c->src_ip_xlate; + sis->dest_ip.ip = c->dest_ip; + sis->dest_ip_xlate.ip = c->dest_ip_xlate; + sis->src_port = c->src_port; + sis->src_port_xlate = c->src_port_xlate; + sis->dest_port = c->dest_port; + sis->dest_port_xlate = c->dest_port_xlate; + + original_cm = c->original_match; + reply_cm = c->reply_match; + sis->src_td_max_window = original_cm->protocol_state.tcp.max_win; + sis->src_td_end = original_cm->protocol_state.tcp.end; + sis->src_td_max_end = original_cm->protocol_state.tcp.max_end; + sis->dest_td_max_window = reply_cm->protocol_state.tcp.max_win; + sis->dest_td_end = reply_cm->protocol_state.tcp.end; + sis->dest_td_max_end = reply_cm->protocol_state.tcp.max_end; + + sis->src_new_packet_count = original_cm->rx_packet_count; + sis->src_new_byte_count = original_cm->rx_byte_count; + sis->dest_new_packet_count = reply_cm->rx_packet_count; + sis->dest_new_byte_count = reply_cm->rx_byte_count; + + sfe_ipv4_connection_match_update_summary_stats(original_cm); + sfe_ipv4_connection_match_update_summary_stats(reply_cm); + + sis->src_dev = original_cm->match_dev; + sis->src_packet_count = original_cm->rx_packet_count64; + sis->src_byte_count = original_cm->rx_byte_count64; + + sis->dest_dev = reply_cm->match_dev; + sis->dest_packet_count = reply_cm->rx_packet_count64; + sis->dest_byte_count = reply_cm->rx_byte_count64; + + sis->reason = reason; + + /* + * Get the time increment since our last sync. + */ + sis->delta_jiffies = now_jiffies - c->last_sync_jiffies; + c->last_sync_jiffies = now_jiffies; +} + +/* + * sfe_ipv4_flush_sfe_ipv4_connection() + * Flush a connection and free all associated resources. + * + * We need to be called with bottom halves disabled locally as we need to acquire + * the connection hash lock and release it again. In general we're actually called + * from within a BH and so we're fine, but we're also called when connections are + * torn down. 
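+ *
+ * Callers unlink the connection from the hash tables before flushing; here
+ * we only generate a final stats sync through any registered callback and
+ * then release the device references and free the connection objects.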
+ */ +static void sfe_ipv4_flush_sfe_ipv4_connection(struct sfe_ipv4 *si, + struct sfe_ipv4_connection *c, + sfe_sync_reason_t reason) +{ + struct sfe_connection_sync sis; + u64 now_jiffies; + sfe_sync_rule_callback_t sync_rule_callback; + + rcu_read_lock(); + spin_lock_bh(&si->lock); + si->connection_flushes++; + sync_rule_callback = rcu_dereference(si->sync_rule_callback); + spin_unlock_bh(&si->lock); + + if (sync_rule_callback) { + /* + * Generate a sync message and then sync. + */ + now_jiffies = get_jiffies_64(); + sfe_ipv4_gen_sync_sfe_ipv4_connection(si, c, &sis, reason, now_jiffies); + sync_rule_callback(&sis); + } + + rcu_read_unlock(); + + /* + * Release our hold of the source and dest devices and free the memory + * for our connection objects. + */ + dev_put(c->original_dev); + dev_put(c->reply_dev); + kfree(c->original_match); + kfree(c->reply_match); + kfree(c); +} + +/* + * sfe_ipv4_recv_udp() + * Handle UDP packet receives and forwarding. + */ +static int sfe_ipv4_recv_udp(struct sfe_ipv4 *si, struct sk_buff *skb, struct net_device *dev, + unsigned int len, struct sfe_ipv4_ip_hdr *iph, unsigned int ihl, bool flush_on_find) +{ + struct sfe_ipv4_udp_hdr *udph; + __be32 src_ip; + __be32 dest_ip; + __be16 src_port; + __be16 dest_port; + struct sfe_ipv4_connection_match *cm; + u8 ttl; + struct net_device *xmit_dev; + + /* + * Is our packet too short to contain a valid UDP header? + */ + if (unlikely(!pskb_may_pull(skb, (sizeof(struct sfe_ipv4_udp_hdr) + ihl)))) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_UDP_HEADER_INCOMPLETE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("packet too short for UDP header\n"); + return 0; + } + + /* + * Read the IP address and port information. Read the IP header data first + * because we've almost certainly got that in the cache. We may not yet have + * the UDP header cached though so allow more time for any prefetching. + */ + src_ip = iph->saddr; + dest_ip = iph->daddr; + + udph = (struct sfe_ipv4_udp_hdr *)(skb->data + ihl); + src_port = udph->source; + dest_port = udph->dest; + + spin_lock_bh(&si->lock); + + /* + * Look for a connection match. + */ +#ifdef CONFIG_NF_FLOW_COOKIE + cm = si->sfe_flow_cookie_table[skb->flow_cookie & SFE_FLOW_COOKIE_MASK].match; + if (unlikely(!cm)) { + cm = sfe_ipv4_find_sfe_ipv4_connection_match(si, dev, IPPROTO_UDP, src_ip, src_port, dest_ip, dest_port); + } +#else + cm = sfe_ipv4_find_sfe_ipv4_connection_match(si, dev, IPPROTO_UDP, src_ip, src_port, dest_ip, dest_port); +#endif + if (unlikely(!cm)) { + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_UDP_NO_CONNECTION]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("no connection found\n"); + return 0; + } + + /* + * If our packet has beern marked as "flush on find" we can't actually + * forward it in the fast path, but now that we've found an associated + * connection we can flush that out before we process the packet. + */ + if (unlikely(flush_on_find)) { + struct sfe_ipv4_connection *c = cm->connection; + sfe_ipv4_remove_sfe_ipv4_connection(si, c); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_UDP_IP_OPTIONS_OR_INITIAL_FRAGMENT]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("flush on find\n"); + sfe_ipv4_flush_sfe_ipv4_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + +#ifdef CONFIG_XFRM + /* + * We can't accelerate the flow on this direction, just let it go + * through the slow path. 
+ */ + if (unlikely(!cm->flow_accel)) { + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + return 0; + } +#endif + + /* + * Does our TTL allow forwarding? + */ + ttl = iph->ttl; + if (unlikely(ttl < 2)) { + struct sfe_ipv4_connection *c = cm->connection; + sfe_ipv4_remove_sfe_ipv4_connection(si, c); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_UDP_SMALL_TTL]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("ttl too low\n"); + sfe_ipv4_flush_sfe_ipv4_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + /* + * If our packet is larger than the MTU of the transmit interface then + * we can't forward it easily. + */ + if (unlikely(len > cm->xmit_dev_mtu)) { + struct sfe_ipv4_connection *c = cm->connection; + sfe_ipv4_remove_sfe_ipv4_connection(si, c); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_UDP_NEEDS_FRAGMENTATION]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("larger than mtu\n"); + sfe_ipv4_flush_sfe_ipv4_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + /* + * From this point on we're good to modify the packet. + */ + + /* + * Check if skb was cloned. If it was, unshare it. Because + * the data area is going to be written in this path and we don't want to + * change the cloned skb's data section. + */ + if (unlikely(skb_cloned(skb))) { + DEBUG_TRACE("%px: skb is a cloned skb\n", skb); + skb = skb_unshare(skb, GFP_ATOMIC); + if (!skb) { + DEBUG_WARN("Failed to unshare the cloned skb\n"); + return 0; + } + + /* + * Update the iph and udph pointers with the unshared skb's data area. + */ + iph = (struct sfe_ipv4_ip_hdr *)skb->data; + udph = (struct sfe_ipv4_udp_hdr *)(skb->data + ihl); + } + + /* + * Update DSCP + */ + if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_DSCP_REMARK)) { + iph->tos = (iph->tos & SFE_IPV4_DSCP_MASK) | cm->dscp; + } + + /* + * Decrement our TTL. + */ + iph->ttl = ttl - 1; + + /* + * Do we have to perform translations of the source address/port? + */ + if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_SRC)) { + u16 udp_csum; + + iph->saddr = cm->xlate_src_ip; + udph->source = cm->xlate_src_port; + + /* + * Do we have a non-zero UDP checksum? If we do then we need + * to update it. + */ + udp_csum = udph->check; + if (likely(udp_csum)) { + u32 sum; + + if (unlikely(skb->ip_summed == CHECKSUM_PARTIAL)) { + sum = udp_csum + cm->xlate_src_partial_csum_adjustment; + } else { + sum = udp_csum + cm->xlate_src_csum_adjustment; + } + + sum = (sum & 0xffff) + (sum >> 16); + udph->check = (u16)sum; + } + } + + /* + * Do we have to perform translations of the destination address/port? + */ + if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_DEST)) { + u16 udp_csum; + + iph->daddr = cm->xlate_dest_ip; + udph->dest = cm->xlate_dest_port; + + /* + * Do we have a non-zero UDP checksum? If we do then we need + * to update it. + */ + udp_csum = udph->check; + if (likely(udp_csum)) { + u32 sum; + + if (unlikely(skb->ip_summed == CHECKSUM_PARTIAL)) { + sum = udp_csum + cm->xlate_dest_partial_csum_adjustment; + } else { + sum = udp_csum + cm->xlate_dest_csum_adjustment; + } + + sum = (sum & 0xffff) + (sum >> 16); + udph->check = (u16)sum; + } + } + + /* + * Replace the IP checksum. + */ + iph->check = sfe_ipv4_gen_ip_csum(iph); + + /* + * Update traffic stats. + */ + cm->rx_packet_count++; + cm->rx_byte_count += len; + + /* + * If we're not already on the active list then insert ourselves at the tail + * of the current list. 
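+	 * The active list exists so that periodic stats syncing can restrict
+	 * itself to connections that have actually carried traffic rather than
+	 * walking every connection.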
+ */ + if (unlikely(!cm->active)) { + cm->active = true; + cm->active_prev = si->active_tail; + if (likely(si->active_tail)) { + si->active_tail->active_next = cm; + } else { + si->active_head = cm; + } + si->active_tail = cm; + } + + xmit_dev = cm->xmit_dev; + skb->dev = xmit_dev; + + /* + * Check to see if we need to write a header. + */ + if (likely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_WRITE_L2_HDR)) { + if (unlikely(!(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_WRITE_FAST_ETH_HDR))) { + dev_hard_header(skb, xmit_dev, ETH_P_IP, + cm->xmit_dest_mac, cm->xmit_src_mac, len); + } else { + /* + * For the simple case we write this really fast. + */ + struct sfe_ipv4_eth_hdr *eth = (struct sfe_ipv4_eth_hdr *)__skb_push(skb, ETH_HLEN); + eth->h_proto = htons(ETH_P_IP); + eth->h_dest[0] = cm->xmit_dest_mac[0]; + eth->h_dest[1] = cm->xmit_dest_mac[1]; + eth->h_dest[2] = cm->xmit_dest_mac[2]; + eth->h_source[0] = cm->xmit_src_mac[0]; + eth->h_source[1] = cm->xmit_src_mac[1]; + eth->h_source[2] = cm->xmit_src_mac[2]; + } + } + + /* + * Update priority of skb. + */ + if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_PRIORITY_REMARK)) { + skb->priority = cm->priority; + } + + /* + * Mark outgoing packet. + */ + skb->mark = cm->connection->mark; + if (skb->mark) { + DEBUG_TRACE("SKB MARK is NON ZERO %x\n", skb->mark); + } + + si->packets_forwarded++; + spin_unlock_bh(&si->lock); + + /* + * We're going to check for GSO flags when we transmit the packet so + * start fetching the necessary cache line now. + */ + prefetch(skb_shinfo(skb)); + + /* + * Mark that this packet has been fast forwarded. + */ + skb->fast_forwarded = 1; + + /* + * Send the packet on its way. + */ + dev_queue_xmit(skb); + + return 1; +} + +/* + * sfe_ipv4_process_tcp_option_sack() + * Parse TCP SACK option and update ack according + */ +static bool sfe_ipv4_process_tcp_option_sack(const struct sfe_ipv4_tcp_hdr *th, const u32 data_offs, + u32 *ack) +{ + u32 length = sizeof(struct sfe_ipv4_tcp_hdr); + u8 *ptr = (u8 *)th + length; + + /* + * Ignore processing if TCP packet has only TIMESTAMP option. + */ + if (likely(data_offs == length + TCPOLEN_TIMESTAMP + 1 + 1) + && likely(ptr[0] == TCPOPT_NOP) + && likely(ptr[1] == TCPOPT_NOP) + && likely(ptr[2] == TCPOPT_TIMESTAMP) + && likely(ptr[3] == TCPOLEN_TIMESTAMP)) { + return true; + } + + /* + * TCP options. Parse SACK option. + */ + while (length < data_offs) { + u8 size; + u8 kind; + + ptr = (u8 *)th + length; + kind = *ptr; + + /* + * NOP, for padding + * Not in the switch because to fast escape and to not calculate size + */ + if (kind == TCPOPT_NOP) { + length++; + continue; + } + + if (kind == TCPOPT_SACK) { + u32 sack = 0; + u8 re = 1 + 1; + + size = *(ptr + 1); + if ((size < (1 + 1 + TCPOLEN_SACK_PERBLOCK)) + || ((size - (1 + 1)) % (TCPOLEN_SACK_PERBLOCK)) + || (size > (data_offs - length))) { + return false; + } + + re += 4; + while (re < size) { + u32 sack_re; + u8 *sptr = ptr + re; + sack_re = (sptr[0] << 24) | (sptr[1] << 16) | (sptr[2] << 8) | sptr[3]; + if (sack_re > sack) { + sack = sack_re; + } + re += TCPOLEN_SACK_PERBLOCK; + } + if (sack > *ack) { + *ack = sack; + } + length += size; + continue; + } + if (kind == TCPOPT_EOL) { + return true; + } + size = *(ptr + 1); + if (size < 2) { + return false; + } + length += size; + } + + return true; +} + +/* + * sfe_ipv4_recv_tcp() + * Handle TCP packet receives and forwarding. 
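+ *
+ * Only plain ACK segments on a known connection that pass the TTL, MTU and
+ * (unless sequence checking is disabled) window validation below are taken
+ * on the fast path; everything else is exceptioned back to the Linux stack.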
+ */ +static int sfe_ipv4_recv_tcp(struct sfe_ipv4 *si, struct sk_buff *skb, struct net_device *dev, + unsigned int len, struct sfe_ipv4_ip_hdr *iph, unsigned int ihl, bool flush_on_find) +{ + struct sfe_ipv4_tcp_hdr *tcph; + __be32 src_ip; + __be32 dest_ip; + __be16 src_port; + __be16 dest_port; + struct sfe_ipv4_connection_match *cm; + struct sfe_ipv4_connection_match *counter_cm; + u8 ttl; + u32 flags; + struct net_device *xmit_dev; + + /* + * Is our packet too short to contain a valid UDP header? + */ + if (unlikely(!pskb_may_pull(skb, (sizeof(struct sfe_ipv4_tcp_hdr) + ihl)))) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_HEADER_INCOMPLETE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("packet too short for TCP header\n"); + return 0; + } + + /* + * Read the IP address and port information. Read the IP header data first + * because we've almost certainly got that in the cache. We may not yet have + * the TCP header cached though so allow more time for any prefetching. + */ + src_ip = iph->saddr; + dest_ip = iph->daddr; + + tcph = (struct sfe_ipv4_tcp_hdr *)(skb->data + ihl); + src_port = tcph->source; + dest_port = tcph->dest; + flags = tcp_flag_word(tcph); + + spin_lock_bh(&si->lock); + + /* + * Look for a connection match. + */ +#ifdef CONFIG_NF_FLOW_COOKIE + cm = si->sfe_flow_cookie_table[skb->flow_cookie & SFE_FLOW_COOKIE_MASK].match; + if (unlikely(!cm)) { + cm = sfe_ipv4_find_sfe_ipv4_connection_match(si, dev, IPPROTO_TCP, src_ip, src_port, dest_ip, dest_port); + } +#else + cm = sfe_ipv4_find_sfe_ipv4_connection_match(si, dev, IPPROTO_TCP, src_ip, src_port, dest_ip, dest_port); +#endif + if (unlikely(!cm)) { + /* + * We didn't get a connection but as TCP is connection-oriented that + * may be because this is a non-fast connection (not running established). + * For diagnostic purposes we differentiate this here. + */ + if (likely((flags & (TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_FIN | TCP_FLAG_ACK)) == TCP_FLAG_ACK)) { + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_NO_CONNECTION_FAST_FLAGS]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("no connection found - fast flags\n"); + return 0; + } + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_NO_CONNECTION_SLOW_FLAGS]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("no connection found - slow flags: 0x%x\n", + flags & (TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_FIN | TCP_FLAG_ACK)); + return 0; + } + + /* + * If our packet has beern marked as "flush on find" we can't actually + * forward it in the fast path, but now that we've found an associated + * connection we can flush that out before we process the packet. + */ + if (unlikely(flush_on_find)) { + struct sfe_ipv4_connection *c = cm->connection; + sfe_ipv4_remove_sfe_ipv4_connection(si, c); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_IP_OPTIONS_OR_INITIAL_FRAGMENT]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("flush on find\n"); + sfe_ipv4_flush_sfe_ipv4_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + +#ifdef CONFIG_XFRM + /* + * We can't accelerate the flow on this direction, just let it go + * through the slow path. + */ + if (unlikely(!cm->flow_accel)) { + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + return 0; + } +#endif + /* + * Does our TTL allow forwarding? 
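+	 * A TTL of 0 or 1 is left to the slow path so that it can deal with the
+	 * expiry (e.g. generating an ICMP time-exceeded error).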
+ */ + ttl = iph->ttl; + if (unlikely(ttl < 2)) { + struct sfe_ipv4_connection *c = cm->connection; + sfe_ipv4_remove_sfe_ipv4_connection(si, c); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_SMALL_TTL]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("ttl too low\n"); + sfe_ipv4_flush_sfe_ipv4_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + /* + * If our packet is larger than the MTU of the transmit interface then + * we can't forward it easily. + */ + if (unlikely((len > cm->xmit_dev_mtu) && !skb_is_gso(skb))) { + struct sfe_ipv4_connection *c = cm->connection; + sfe_ipv4_remove_sfe_ipv4_connection(si, c); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_NEEDS_FRAGMENTATION]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("larger than mtu\n"); + sfe_ipv4_flush_sfe_ipv4_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + /* + * Look at our TCP flags. Anything missing an ACK or that has RST, SYN or FIN + * set is not a fast path packet. + */ + if (unlikely((flags & (TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_FIN | TCP_FLAG_ACK)) != TCP_FLAG_ACK)) { + struct sfe_ipv4_connection *c = cm->connection; + sfe_ipv4_remove_sfe_ipv4_connection(si, c); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_FLAGS]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("TCP flags: 0x%x are not fast\n", + flags & (TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_FIN | TCP_FLAG_ACK)); + sfe_ipv4_flush_sfe_ipv4_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + counter_cm = cm->counter_match; + + /* + * Are we doing sequence number checking? + */ + if (likely(!(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK))) { + u32 seq; + u32 ack; + u32 sack; + u32 data_offs; + u32 end; + u32 left_edge; + u32 scaled_win; + u32 max_end; + + /* + * Is our sequence fully past the right hand edge of the window? + */ + seq = ntohl(tcph->seq); + if (unlikely((s32)(seq - (cm->protocol_state.tcp.max_end + 1)) > 0)) { + struct sfe_ipv4_connection *c = cm->connection; + sfe_ipv4_remove_sfe_ipv4_connection(si, c); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_SEQ_EXCEEDS_RIGHT_EDGE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("seq: %u exceeds right edge: %u\n", + seq, cm->protocol_state.tcp.max_end + 1); + sfe_ipv4_flush_sfe_ipv4_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + /* + * Check that our TCP data offset isn't too short. + */ + data_offs = tcph->doff << 2; + if (unlikely(data_offs < sizeof(struct sfe_ipv4_tcp_hdr))) { + struct sfe_ipv4_connection *c = cm->connection; + sfe_ipv4_remove_sfe_ipv4_connection(si, c); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_SMALL_DATA_OFFS]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("TCP data offset: %u, too small\n", data_offs); + sfe_ipv4_flush_sfe_ipv4_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + /* + * Update ACK according to any SACK option. 
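+		 * sfe_ipv4_process_tcp_option_sack() raises "sack" to the highest
+		 * right edge found in any SACK block, and that value is then used in
+		 * place of the plain ACK for the window arithmetic below.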
+ */ + ack = ntohl(tcph->ack_seq); + sack = ack; + if (unlikely(!sfe_ipv4_process_tcp_option_sack(tcph, data_offs, &sack))) { + struct sfe_ipv4_connection *c = cm->connection; + sfe_ipv4_remove_sfe_ipv4_connection(si, c); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_BAD_SACK]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("TCP option SACK size is wrong\n"); + sfe_ipv4_flush_sfe_ipv4_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + /* + * Check that our TCP data offset isn't past the end of the packet. + */ + data_offs += sizeof(struct sfe_ipv4_ip_hdr); + if (unlikely(len < data_offs)) { + struct sfe_ipv4_connection *c = cm->connection; + sfe_ipv4_remove_sfe_ipv4_connection(si, c); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_BIG_DATA_OFFS]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("TCP data offset: %u, past end of packet: %u\n", + data_offs, len); + sfe_ipv4_flush_sfe_ipv4_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + end = seq + len - data_offs; + + /* + * Is our sequence fully before the left hand edge of the window? + */ + if (unlikely((s32)(end - (cm->protocol_state.tcp.end + - counter_cm->protocol_state.tcp.max_win - 1)) < 0)) { + struct sfe_ipv4_connection *c = cm->connection; + sfe_ipv4_remove_sfe_ipv4_connection(si, c); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_SEQ_BEFORE_LEFT_EDGE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("seq: %u before left edge: %u\n", + end, cm->protocol_state.tcp.end - counter_cm->protocol_state.tcp.max_win - 1); + sfe_ipv4_flush_sfe_ipv4_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + /* + * Are we acking data that is to the right of what has been sent? + */ + if (unlikely((s32)(sack - (counter_cm->protocol_state.tcp.end + 1)) > 0)) { + struct sfe_ipv4_connection *c = cm->connection; + sfe_ipv4_remove_sfe_ipv4_connection(si, c); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_ACK_EXCEEDS_RIGHT_EDGE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("ack: %u exceeds right edge: %u\n", + sack, counter_cm->protocol_state.tcp.end + 1); + sfe_ipv4_flush_sfe_ipv4_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + /* + * Is our ack too far before the left hand edge of the window? + */ + left_edge = counter_cm->protocol_state.tcp.end + - cm->protocol_state.tcp.max_win + - SFE_IPV4_TCP_MAX_ACK_WINDOW + - 1; + if (unlikely((s32)(sack - left_edge) < 0)) { + struct sfe_ipv4_connection *c = cm->connection; + sfe_ipv4_remove_sfe_ipv4_connection(si, c); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_ACK_BEFORE_LEFT_EDGE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("ack: %u before left edge: %u\n", sack, left_edge); + sfe_ipv4_flush_sfe_ipv4_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + /* + * Have we just seen the largest window size yet for this connection? If yes + * then we need to record the new value. + */ + scaled_win = ntohs(tcph->window) << cm->protocol_state.tcp.win_scale; + scaled_win += (sack - ack); + if (unlikely(cm->protocol_state.tcp.max_win < scaled_win)) { + cm->protocol_state.tcp.max_win = scaled_win; + } + + /* + * If our sequence and/or ack numbers have advanced then record the new state. 
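+		 *
+		 * The (s32)(a - b) >= 0 tests below are the usual serial number
+		 * comparison trick: the subtraction is done modulo 2^32, so the
+		 * checks keep working across sequence number wrap-around.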
+ */ + if (likely((s32)(end - cm->protocol_state.tcp.end) >= 0)) { + cm->protocol_state.tcp.end = end; + } + + max_end = sack + scaled_win; + if (likely((s32)(max_end - counter_cm->protocol_state.tcp.max_end) >= 0)) { + counter_cm->protocol_state.tcp.max_end = max_end; + } + } + + /* + * From this point on we're good to modify the packet. + */ + + /* + * Check if skb was cloned. If it was, unshare it. Because + * the data area is going to be written in this path and we don't want to + * change the cloned skb's data section. + */ + if (unlikely(skb_cloned(skb))) { + DEBUG_TRACE("%px: skb is a cloned skb\n", skb); + skb = skb_unshare(skb, GFP_ATOMIC); + if (!skb) { + DEBUG_WARN("Failed to unshare the cloned skb\n"); + return 0; + } + + /* + * Update the iph and tcph pointers with the unshared skb's data area. + */ + iph = (struct sfe_ipv4_ip_hdr *)skb->data; + tcph = (struct sfe_ipv4_tcp_hdr *)(skb->data + ihl); + } + + /* + * Update DSCP + */ + if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_DSCP_REMARK)) { + iph->tos = (iph->tos & SFE_IPV4_DSCP_MASK) | cm->dscp; + } + + /* + * Decrement our TTL. + */ + iph->ttl = ttl - 1; + + /* + * Do we have to perform translations of the source address/port? + */ + if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_SRC)) { + u16 tcp_csum; + u32 sum; + + iph->saddr = cm->xlate_src_ip; + tcph->source = cm->xlate_src_port; + + /* + * Do we have a non-zero UDP checksum? If we do then we need + * to update it. + */ + tcp_csum = tcph->check; + if (unlikely(skb->ip_summed == CHECKSUM_PARTIAL)) { + sum = tcp_csum + cm->xlate_src_partial_csum_adjustment; + } else { + sum = tcp_csum + cm->xlate_src_csum_adjustment; + } + + sum = (sum & 0xffff) + (sum >> 16); + tcph->check = (u16)sum; + } + + /* + * Do we have to perform translations of the destination address/port? + */ + if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_DEST)) { + u16 tcp_csum; + u32 sum; + + iph->daddr = cm->xlate_dest_ip; + tcph->dest = cm->xlate_dest_port; + + /* + * Do we have a non-zero UDP checksum? If we do then we need + * to update it. + */ + tcp_csum = tcph->check; + if (unlikely(skb->ip_summed == CHECKSUM_PARTIAL)) { + sum = tcp_csum + cm->xlate_dest_partial_csum_adjustment; + } else { + sum = tcp_csum + cm->xlate_dest_csum_adjustment; + } + + sum = (sum & 0xffff) + (sum >> 16); + tcph->check = (u16)sum; + } + + /* + * Replace the IP checksum. + */ + iph->check = sfe_ipv4_gen_ip_csum(iph); + + /* + * Update traffic stats. + */ + cm->rx_packet_count++; + cm->rx_byte_count += len; + + /* + * If we're not already on the active list then insert ourselves at the tail + * of the current list. + */ + if (unlikely(!cm->active)) { + cm->active = true; + cm->active_prev = si->active_tail; + if (likely(si->active_tail)) { + si->active_tail->active_next = cm; + } else { + si->active_head = cm; + } + si->active_tail = cm; + } + + xmit_dev = cm->xmit_dev; + skb->dev = xmit_dev; + + /* + * Check to see if we need to write a header. + */ + if (likely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_WRITE_L2_HDR)) { + if (unlikely(!(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_WRITE_FAST_ETH_HDR))) { + dev_hard_header(skb, xmit_dev, ETH_P_IP, + cm->xmit_dest_mac, cm->xmit_src_mac, len); + } else { + /* + * For the simple case we write this really fast. 
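+			 * h_dest/h_source are declared as three 16-bit words (see struct
+			 * sfe_ipv4_eth_hdr), so the copy below is six half-word stores
+			 * rather than a memcpy of two 6-byte addresses.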
+ */ + struct sfe_ipv4_eth_hdr *eth = (struct sfe_ipv4_eth_hdr *)__skb_push(skb, ETH_HLEN); + eth->h_proto = htons(ETH_P_IP); + eth->h_dest[0] = cm->xmit_dest_mac[0]; + eth->h_dest[1] = cm->xmit_dest_mac[1]; + eth->h_dest[2] = cm->xmit_dest_mac[2]; + eth->h_source[0] = cm->xmit_src_mac[0]; + eth->h_source[1] = cm->xmit_src_mac[1]; + eth->h_source[2] = cm->xmit_src_mac[2]; + } + } + + /* + * Update priority of skb. + */ + if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_PRIORITY_REMARK)) { + skb->priority = cm->priority; + } + + /* + * Mark outgoing packet + */ + skb->mark = cm->connection->mark; + if (skb->mark) { + DEBUG_TRACE("SKB MARK is NON ZERO %x\n", skb->mark); + } + + si->packets_forwarded++; + spin_unlock_bh(&si->lock); + + /* + * We're going to check for GSO flags when we transmit the packet so + * start fetching the necessary cache line now. + */ + prefetch(skb_shinfo(skb)); + + /* + * Mark that this packet has been fast forwarded. + */ + skb->fast_forwarded = 1; + + /* + * Send the packet on its way. + */ + dev_queue_xmit(skb); + + return 1; +} + +/* + * sfe_ipv4_recv_icmp() + * Handle ICMP packet receives. + * + * ICMP packets aren't handled as a "fast path" and always have us process them + * through the default Linux stack. What we do need to do is look for any errors + * about connections we are handling in the fast path. If we find any such + * connections then we want to flush their state so that the ICMP error path + * within Linux has all of the correct state should it need it. + */ +static int sfe_ipv4_recv_icmp(struct sfe_ipv4 *si, struct sk_buff *skb, struct net_device *dev, + unsigned int len, struct sfe_ipv4_ip_hdr *iph, unsigned int ihl) +{ + struct icmphdr *icmph; + struct sfe_ipv4_ip_hdr *icmp_iph; + unsigned int icmp_ihl_words; + unsigned int icmp_ihl; + u32 *icmp_trans_h; + struct sfe_ipv4_udp_hdr *icmp_udph; + struct sfe_ipv4_tcp_hdr *icmp_tcph; + __be32 src_ip; + __be32 dest_ip; + __be16 src_port; + __be16 dest_port; + struct sfe_ipv4_connection_match *cm; + struct sfe_ipv4_connection *c; + u32 pull_len = sizeof(struct icmphdr) + ihl; + + /* + * Is our packet too short to contain a valid ICMP header? + */ + len -= ihl; + if (!pskb_may_pull(skb, pull_len)) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_ICMP_HEADER_INCOMPLETE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("packet too short for ICMP header\n"); + return 0; + } + + /* + * We only handle "destination unreachable" and "time exceeded" messages. + */ + icmph = (struct icmphdr *)(skb->data + ihl); + if ((icmph->type != ICMP_DEST_UNREACH) + && (icmph->type != ICMP_TIME_EXCEEDED)) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_ICMP_UNHANDLED_TYPE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("unhandled ICMP type: 0x%x\n", icmph->type); + return 0; + } + + /* + * Do we have the full embedded IP header? + */ + len -= sizeof(struct icmphdr); + pull_len += sizeof(struct sfe_ipv4_ip_hdr); + if (!pskb_may_pull(skb, pull_len)) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_HEADER_INCOMPLETE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("Embedded IP header not complete\n"); + return 0; + } + + /* + * Is our embedded IP version wrong? 
+ */ + icmp_iph = (struct sfe_ipv4_ip_hdr *)(icmph + 1); + if (unlikely(icmp_iph->version != 4)) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_NON_V4]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("IP version: %u\n", icmp_iph->version); + return 0; + } + + /* + * Do we have the full embedded IP header, including any options? + */ + icmp_ihl_words = icmp_iph->ihl; + icmp_ihl = icmp_ihl_words << 2; + pull_len += icmp_ihl - sizeof(struct sfe_ipv4_ip_hdr); + if (!pskb_may_pull(skb, pull_len)) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_IP_OPTIONS_INCOMPLETE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("Embedded header not large enough for IP options\n"); + return 0; + } + + len -= icmp_ihl; + icmp_trans_h = ((u32 *)icmp_iph) + icmp_ihl_words; + + /* + * Handle the embedded transport layer header. + */ + switch (icmp_iph->protocol) { + case IPPROTO_UDP: + /* + * We should have 8 bytes of UDP header - that's enough to identify + * the connection. + */ + pull_len += 8; + if (!pskb_may_pull(skb, pull_len)) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_UDP_HEADER_INCOMPLETE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("Incomplete embedded UDP header\n"); + return 0; + } + + icmp_udph = (struct sfe_ipv4_udp_hdr *)icmp_trans_h; + src_port = icmp_udph->source; + dest_port = icmp_udph->dest; + break; + + case IPPROTO_TCP: + /* + * We should have 8 bytes of TCP header - that's enough to identify + * the connection. + */ + pull_len += 8; + if (!pskb_may_pull(skb, pull_len)) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_TCP_HEADER_INCOMPLETE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("Incomplete embedded TCP header\n"); + return 0; + } + + icmp_tcph = (struct sfe_ipv4_tcp_hdr *)icmp_trans_h; + src_port = icmp_tcph->source; + dest_port = icmp_tcph->dest; + break; + + default: + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_UNHANDLED_PROTOCOL]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("Unhandled embedded IP protocol: %u\n", icmp_iph->protocol); + return 0; + } + + src_ip = icmp_iph->saddr; + dest_ip = icmp_iph->daddr; + + spin_lock_bh(&si->lock); + + /* + * Look for a connection match. Note that we reverse the source and destination + * here because our embedded message contains a packet that was sent in the + * opposite direction to the one in which we just received it. It will have + * been sent on the interface from which we received it though so that's still + * ok to use. + */ + cm = sfe_ipv4_find_sfe_ipv4_connection_match(si, dev, icmp_iph->protocol, dest_ip, dest_port, src_ip, src_port); + if (unlikely(!cm)) { + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_ICMP_NO_CONNECTION]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("no connection found\n"); + return 0; + } + + /* + * We found a connection so now remove it from the connection list and flush + * its state. 
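+	 * The ICMP error itself still goes up the normal stack (we return 0);
+	 * all we do here is drop the fast-path state so that Linux has a
+	 * consistent view of the affected flow.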
+ */ + c = cm->connection; + sfe_ipv4_remove_sfe_ipv4_connection(si, c); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_ICMP_FLUSHED_CONNECTION]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + sfe_ipv4_flush_sfe_ipv4_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; +} + +/* + * sfe_ipv4_recv() + * Handle packet receives and forwaring. + * + * Returns 1 if the packet is forwarded or 0 if it isn't. + */ +int sfe_ipv4_recv(struct net_device *dev, struct sk_buff *skb) +{ + struct sfe_ipv4 *si = &__si; + unsigned int len; + unsigned int tot_len; + unsigned int frag_off; + unsigned int ihl; + bool flush_on_find; + bool ip_options; + struct sfe_ipv4_ip_hdr *iph; + u32 protocol; + + /* + * Check that we have space for an IP header here. + */ + len = skb->len; + if (unlikely(!pskb_may_pull(skb, sizeof(struct sfe_ipv4_ip_hdr)))) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_HEADER_INCOMPLETE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("len: %u is too short\n", len); + return 0; + } + + /* + * Check that our "total length" is large enough for an IP header. + */ + iph = (struct sfe_ipv4_ip_hdr *)skb->data; + tot_len = ntohs(iph->tot_len); + if (unlikely(tot_len < sizeof(struct sfe_ipv4_ip_hdr))) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_BAD_TOTAL_LENGTH]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("tot_len: %u is too short\n", tot_len); + return 0; + } + + /* + * Is our IP version wrong? + */ + if (unlikely(iph->version != 4)) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_NON_V4]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("IP version: %u\n", iph->version); + return 0; + } + + /* + * Does our datagram fit inside the skb? + */ + if (unlikely(tot_len > len)) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_DATAGRAM_INCOMPLETE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("tot_len: %u, exceeds len: %u\n", tot_len, len); + return 0; + } + + /* + * Do we have a non-initial fragment? + */ + frag_off = ntohs(iph->frag_off); + if (unlikely(frag_off & IP_OFFSET)) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_NON_INITIAL_FRAGMENT]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("non-initial fragment\n"); + return 0; + } + + /* + * If we have a (first) fragment then mark it to cause any connection to flush. + */ + flush_on_find = unlikely(frag_off & IP_MF) ? true : false; + + /* + * Do we have any IP options? That's definite a slow path! If we do have IP + * options we need to recheck our header size. + */ + ihl = iph->ihl << 2; + ip_options = unlikely(ihl != sizeof(struct sfe_ipv4_ip_hdr)) ? 
true : false; + if (unlikely(ip_options)) { + if (unlikely(len < ihl)) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_IP_OPTIONS_INCOMPLETE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("len: %u is too short for header of size: %u\n", len, ihl); + return 0; + } + + flush_on_find = true; + } + + protocol = iph->protocol; + if (IPPROTO_UDP == protocol) { + return sfe_ipv4_recv_udp(si, skb, dev, len, iph, ihl, flush_on_find); + } + + if (IPPROTO_TCP == protocol) { + return sfe_ipv4_recv_tcp(si, skb, dev, len, iph, ihl, flush_on_find); + } + + if (IPPROTO_ICMP == protocol) { + return sfe_ipv4_recv_icmp(si, skb, dev, len, iph, ihl); + } + + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_UNHANDLED_PROTOCOL]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("not UDP, TCP or ICMP: %u\n", protocol); + return 0; +} + +static void +sfe_ipv4_update_tcp_state(struct sfe_ipv4_connection *c, + struct sfe_connection_create *sic) +{ + struct sfe_ipv4_connection_match *orig_cm; + struct sfe_ipv4_connection_match *repl_cm; + struct sfe_ipv4_tcp_connection_match *orig_tcp; + struct sfe_ipv4_tcp_connection_match *repl_tcp; + + orig_cm = c->original_match; + repl_cm = c->reply_match; + orig_tcp = &orig_cm->protocol_state.tcp; + repl_tcp = &repl_cm->protocol_state.tcp; + + /* update orig */ + if (orig_tcp->max_win < sic->src_td_max_window) { + orig_tcp->max_win = sic->src_td_max_window; + } + if ((s32)(orig_tcp->end - sic->src_td_end) < 0) { + orig_tcp->end = sic->src_td_end; + } + if ((s32)(orig_tcp->max_end - sic->src_td_max_end) < 0) { + orig_tcp->max_end = sic->src_td_max_end; + } + + /* update reply */ + if (repl_tcp->max_win < sic->dest_td_max_window) { + repl_tcp->max_win = sic->dest_td_max_window; + } + if ((s32)(repl_tcp->end - sic->dest_td_end) < 0) { + repl_tcp->end = sic->dest_td_end; + } + if ((s32)(repl_tcp->max_end - sic->dest_td_max_end) < 0) { + repl_tcp->max_end = sic->dest_td_max_end; + } + + /* update match flags */ + orig_cm->flags &= ~SFE_IPV4_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK; + repl_cm->flags &= ~SFE_IPV4_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK; + if (sic->flags & SFE_CREATE_FLAG_NO_SEQ_CHECK) { + orig_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK; + repl_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK; + } +} + +static void +sfe_ipv4_update_protocol_state(struct sfe_ipv4_connection *c, + struct sfe_connection_create *sic) +{ + switch (sic->protocol) { + case IPPROTO_TCP: + sfe_ipv4_update_tcp_state(c, sic); + break; + } +} + +void sfe_ipv4_update_rule(struct sfe_connection_create *sic) +{ + struct sfe_ipv4_connection *c; + struct sfe_ipv4 *si = &__si; + + spin_lock_bh(&si->lock); + + c = sfe_ipv4_find_sfe_ipv4_connection(si, + sic->protocol, + sic->src_ip.ip, + sic->src_port, + sic->dest_ip.ip, + sic->dest_port); + if (c != NULL) { + sfe_ipv4_update_protocol_state(c, sic); + } + + spin_unlock_bh(&si->lock); +} + +/* + * sfe_ipv4_create_rule() + * Create a forwarding rule. 
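+ *
+ * A minimal sketch of a caller, for illustration only (the device, address,
+ * port and MAC names below are placeholders; a real connection manager such
+ * as the ones in this patch fills in many more fields, e.g. the TCP window
+ * state and the priority/DSCP remark flags):
+ *
+ *	struct sfe_connection_create sic;
+ *
+ *	memset(&sic, 0, sizeof(sic));
+ *	sic.protocol = IPPROTO_UDP;
+ *	sic.src_dev = lan_dev;
+ *	sic.dest_dev = wan_dev;
+ *	sic.src_ip.ip = client_ip;
+ *	sic.src_ip_xlate.ip = wan_ip;		/* source is NATed */
+ *	sic.src_port = sic.src_port_xlate = client_port;
+ *	sic.dest_ip.ip = sic.dest_ip_xlate.ip = server_ip;
+ *	sic.dest_port = sic.dest_port_xlate = server_port;
+ *	sic.src_mtu = lan_dev->mtu;
+ *	sic.dest_mtu = wan_dev->mtu;
+ *	memcpy(sic.src_mac, client_mac, ETH_ALEN);
+ *	memcpy(sic.dest_mac_xlate, gateway_mac, ETH_ALEN);
+ *	if (sfe_ipv4_create_rule(&sic))
+ *		;	/* keep using the normal Linux forwarding path */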
+ */ +int sfe_ipv4_create_rule(struct sfe_connection_create *sic) +{ + struct sfe_ipv4 *si = &__si; + struct sfe_ipv4_connection *c; + struct sfe_ipv4_connection_match *original_cm; + struct sfe_ipv4_connection_match *reply_cm; + struct net_device *dest_dev; + struct net_device *src_dev; + + dest_dev = sic->dest_dev; + src_dev = sic->src_dev; + + if (unlikely((dest_dev->reg_state != NETREG_REGISTERED) || + (src_dev->reg_state != NETREG_REGISTERED))) { + return -EINVAL; + } + + spin_lock_bh(&si->lock); + si->connection_create_requests++; + + /* + * Check to see if there is already a flow that matches the rule we're + * trying to create. If there is then we can't create a new one. + */ + c = sfe_ipv4_find_sfe_ipv4_connection(si, + sic->protocol, + sic->src_ip.ip, + sic->src_port, + sic->dest_ip.ip, + sic->dest_port); + if (c != NULL) { + si->connection_create_collisions++; + + /* + * If we already have the flow then it's likely that this + * request to create the connection rule contains more + * up-to-date information. Check and update accordingly. + */ + sfe_ipv4_update_protocol_state(c, sic); + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("connection already exists - mark: %08x, p: %d\n" + " s: %s:%pxM:%pI4:%u, d: %s:%pxM:%pI4:%u\n", + sic->mark, sic->protocol, + sic->src_dev->name, sic->src_mac, &sic->src_ip.ip, ntohs(sic->src_port), + sic->dest_dev->name, sic->dest_mac, &sic->dest_ip.ip, ntohs(sic->dest_port)); + return -EADDRINUSE; + } + + /* + * Allocate the various connection tracking objects. + */ + c = (struct sfe_ipv4_connection *)kmalloc(sizeof(struct sfe_ipv4_connection), GFP_ATOMIC); + if (unlikely(!c)) { + spin_unlock_bh(&si->lock); + return -ENOMEM; + } + + original_cm = (struct sfe_ipv4_connection_match *)kmalloc(sizeof(struct sfe_ipv4_connection_match), GFP_ATOMIC); + if (unlikely(!original_cm)) { + spin_unlock_bh(&si->lock); + kfree(c); + return -ENOMEM; + } + + reply_cm = (struct sfe_ipv4_connection_match *)kmalloc(sizeof(struct sfe_ipv4_connection_match), GFP_ATOMIC); + if (unlikely(!reply_cm)) { + spin_unlock_bh(&si->lock); + kfree(original_cm); + kfree(c); + return -ENOMEM; + } + + /* + * Fill in the "original" direction connection matching object. + * Note that the transmit MAC address is "dest_mac_xlate" because + * we always know both ends of a connection by their translated + * addresses and not their public addresses. 
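+	 * For example, for a flow that is destination-NATed towards an internal
+	 * host, "dest_mac_xlate" is the MAC of that internal (translated) host,
+	 * so the frame we emit already carries the right L2 destination.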
+ */ + original_cm->match_dev = src_dev; + original_cm->match_protocol = sic->protocol; + original_cm->match_src_ip = sic->src_ip.ip; + original_cm->match_src_port = sic->src_port; + original_cm->match_dest_ip = sic->dest_ip.ip; + original_cm->match_dest_port = sic->dest_port; + original_cm->xlate_src_ip = sic->src_ip_xlate.ip; + original_cm->xlate_src_port = sic->src_port_xlate; + original_cm->xlate_dest_ip = sic->dest_ip_xlate.ip; + original_cm->xlate_dest_port = sic->dest_port_xlate; + original_cm->rx_packet_count = 0; + original_cm->rx_packet_count64 = 0; + original_cm->rx_byte_count = 0; + original_cm->rx_byte_count64 = 0; + original_cm->xmit_dev = dest_dev; + original_cm->xmit_dev_mtu = sic->dest_mtu; + memcpy(original_cm->xmit_src_mac, dest_dev->dev_addr, ETH_ALEN); + memcpy(original_cm->xmit_dest_mac, sic->dest_mac_xlate, ETH_ALEN); + original_cm->connection = c; + original_cm->counter_match = reply_cm; + original_cm->flags = 0; + if (sic->flags & SFE_CREATE_FLAG_REMARK_PRIORITY) { + original_cm->priority = sic->src_priority; + original_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_PRIORITY_REMARK; + } + if (sic->flags & SFE_CREATE_FLAG_REMARK_DSCP) { + original_cm->dscp = sic->src_dscp << SFE_IPV4_DSCP_SHIFT; + original_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_DSCP_REMARK; + } +#ifdef CONFIG_NF_FLOW_COOKIE + original_cm->flow_cookie = 0; +#endif +#ifdef CONFIG_XFRM + original_cm->flow_accel = sic->original_accel; +#endif + original_cm->active_next = NULL; + original_cm->active_prev = NULL; + original_cm->active = false; + + /* + * For PPP links we don't write an L2 header. For everything else we do. + */ + if (!(dest_dev->flags & IFF_POINTOPOINT)) { + original_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_WRITE_L2_HDR; + + /* + * If our dev writes Ethernet headers then we can write a really fast + * version. + */ + if (dest_dev->header_ops) { + if (dest_dev->header_ops->create == eth_header) { + original_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_WRITE_FAST_ETH_HDR; + } + } + } + + /* + * Fill in the "reply" direction connection matching object. 
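+	 * The reply direction matches on the post-translation 5-tuple (the
+	 * translated destination becomes the match source and vice versa) and
+	 * its xlate fields undo the translation back to the original addresses.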
+ */ + reply_cm->match_dev = dest_dev; + reply_cm->match_protocol = sic->protocol; + reply_cm->match_src_ip = sic->dest_ip_xlate.ip; + reply_cm->match_src_port = sic->dest_port_xlate; + reply_cm->match_dest_ip = sic->src_ip_xlate.ip; + reply_cm->match_dest_port = sic->src_port_xlate; + reply_cm->xlate_src_ip = sic->dest_ip.ip; + reply_cm->xlate_src_port = sic->dest_port; + reply_cm->xlate_dest_ip = sic->src_ip.ip; + reply_cm->xlate_dest_port = sic->src_port; + reply_cm->rx_packet_count = 0; + reply_cm->rx_packet_count64 = 0; + reply_cm->rx_byte_count = 0; + reply_cm->rx_byte_count64 = 0; + reply_cm->xmit_dev = src_dev; + reply_cm->xmit_dev_mtu = sic->src_mtu; + memcpy(reply_cm->xmit_src_mac, src_dev->dev_addr, ETH_ALEN); + memcpy(reply_cm->xmit_dest_mac, sic->src_mac, ETH_ALEN); + reply_cm->connection = c; + reply_cm->counter_match = original_cm; + reply_cm->flags = 0; + if (sic->flags & SFE_CREATE_FLAG_REMARK_PRIORITY) { + reply_cm->priority = sic->dest_priority; + reply_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_PRIORITY_REMARK; + } + if (sic->flags & SFE_CREATE_FLAG_REMARK_DSCP) { + reply_cm->dscp = sic->dest_dscp << SFE_IPV4_DSCP_SHIFT; + reply_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_DSCP_REMARK; + } +#ifdef CONFIG_NF_FLOW_COOKIE + reply_cm->flow_cookie = 0; +#endif +#ifdef CONFIG_XFRM + reply_cm->flow_accel = sic->reply_accel; +#endif + reply_cm->active_next = NULL; + reply_cm->active_prev = NULL; + reply_cm->active = false; + + /* + * For PPP links we don't write an L2 header. For everything else we do. + */ + if (!(src_dev->flags & IFF_POINTOPOINT)) { + reply_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_WRITE_L2_HDR; + + /* + * If our dev writes Ethernet headers then we can write a really fast + * version. + */ + if (src_dev->header_ops) { + if (src_dev->header_ops->create == eth_header) { + reply_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_WRITE_FAST_ETH_HDR; + } + } + } + + + if (sic->dest_ip.ip != sic->dest_ip_xlate.ip || sic->dest_port != sic->dest_port_xlate) { + original_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_DEST; + reply_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_SRC; + } + + if (sic->src_ip.ip != sic->src_ip_xlate.ip || sic->src_port != sic->src_port_xlate) { + original_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_SRC; + reply_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_DEST; + } + + c->protocol = sic->protocol; + c->src_ip = sic->src_ip.ip; + c->src_ip_xlate = sic->src_ip_xlate.ip; + c->src_port = sic->src_port; + c->src_port_xlate = sic->src_port_xlate; + c->original_dev = src_dev; + c->original_match = original_cm; + c->dest_ip = sic->dest_ip.ip; + c->dest_ip_xlate = sic->dest_ip_xlate.ip; + c->dest_port = sic->dest_port; + c->dest_port_xlate = sic->dest_port_xlate; + c->reply_dev = dest_dev; + c->reply_match = reply_cm; + c->mark = sic->mark; + c->debug_read_seq = 0; + c->last_sync_jiffies = get_jiffies_64(); + + /* + * Take hold of our source and dest devices for the duration of the connection. + */ + dev_hold(c->original_dev); + dev_hold(c->reply_dev); + + /* + * Initialize the protocol-specific information that we track. + */ + switch (sic->protocol) { + case IPPROTO_TCP: + original_cm->protocol_state.tcp.win_scale = sic->src_td_window_scale; + original_cm->protocol_state.tcp.max_win = sic->src_td_max_window ? 
sic->src_td_max_window : 1; + original_cm->protocol_state.tcp.end = sic->src_td_end; + original_cm->protocol_state.tcp.max_end = sic->src_td_max_end; + reply_cm->protocol_state.tcp.win_scale = sic->dest_td_window_scale; + reply_cm->protocol_state.tcp.max_win = sic->dest_td_max_window ? sic->dest_td_max_window : 1; + reply_cm->protocol_state.tcp.end = sic->dest_td_end; + reply_cm->protocol_state.tcp.max_end = sic->dest_td_max_end; + if (sic->flags & SFE_CREATE_FLAG_NO_SEQ_CHECK) { + original_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK; + reply_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK; + } + break; + } + + sfe_ipv4_connection_match_compute_translations(original_cm); + sfe_ipv4_connection_match_compute_translations(reply_cm); + sfe_ipv4_insert_sfe_ipv4_connection(si, c); + + spin_unlock_bh(&si->lock); + + /* + * We have everything we need! + */ + DEBUG_INFO("new connection - mark: %08x, p: %d\n" + " s: %s:%pxM(%pxM):%pI4(%pI4):%u(%u)\n" + " d: %s:%pxM(%pxM):%pI4(%pI4):%u(%u)\n", + sic->mark, sic->protocol, + sic->src_dev->name, sic->src_mac, sic->src_mac_xlate, + &sic->src_ip.ip, &sic->src_ip_xlate.ip, ntohs(sic->src_port), ntohs(sic->src_port_xlate), + dest_dev->name, sic->dest_mac, sic->dest_mac_xlate, + &sic->dest_ip.ip, &sic->dest_ip_xlate.ip, ntohs(sic->dest_port), ntohs(sic->dest_port_xlate)); + + return 0; +} + +/* + * sfe_ipv4_destroy_rule() + * Destroy a forwarding rule. + */ +void sfe_ipv4_destroy_rule(struct sfe_connection_destroy *sid) +{ + struct sfe_ipv4 *si = &__si; + struct sfe_ipv4_connection *c; + + spin_lock_bh(&si->lock); + si->connection_destroy_requests++; + + /* + * Check to see if we have a flow that matches the rule we're trying + * to destroy. If there isn't then we can't destroy it. + */ + c = sfe_ipv4_find_sfe_ipv4_connection(si, sid->protocol, sid->src_ip.ip, sid->src_port, + sid->dest_ip.ip, sid->dest_port); + if (!c) { + si->connection_destroy_misses++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("connection does not exist - p: %d, s: %pI4:%u, d: %pI4:%u\n", + sid->protocol, &sid->src_ip, ntohs(sid->src_port), + &sid->dest_ip, ntohs(sid->dest_port)); + return; + } + + /* + * Remove our connection details from the hash tables. + */ + sfe_ipv4_remove_sfe_ipv4_connection(si, c); + spin_unlock_bh(&si->lock); + + sfe_ipv4_flush_sfe_ipv4_connection(si, c, SFE_SYNC_REASON_DESTROY); + + DEBUG_INFO("connection destroyed - p: %d, s: %pI4:%u, d: %pI4:%u\n", + sid->protocol, &sid->src_ip.ip, ntohs(sid->src_port), + &sid->dest_ip.ip, ntohs(sid->dest_port)); +} + +/* + * sfe_ipv4_register_sync_rule_callback() + * Register a callback for rule synchronization. + */ +void sfe_ipv4_register_sync_rule_callback(sfe_sync_rule_callback_t sync_rule_callback) +{ + struct sfe_ipv4 *si = &__si; + + spin_lock_bh(&si->lock); + rcu_assign_pointer(si->sync_rule_callback, sync_rule_callback); + spin_unlock_bh(&si->lock); +} + +/* + * sfe_ipv4_get_debug_dev() + */ +static ssize_t sfe_ipv4_get_debug_dev(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct sfe_ipv4 *si = &__si; + ssize_t count; + int num; + + spin_lock_bh(&si->lock); + num = si->debug_dev; + spin_unlock_bh(&si->lock); + + count = snprintf(buf, (ssize_t)PAGE_SIZE, "%d\n", num); + return count; +} + +/* + * sysfs attributes. + */ +static const struct device_attribute sfe_ipv4_debug_dev_attr = + __ATTR(debug_dev, S_IWUSR | S_IRUGO, sfe_ipv4_get_debug_dev, NULL); + +/* + * sfe_ipv4_destroy_all_rules_for_dev() + * Destroy all connections that match a particular device. 
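+ *
+ * Connection managers can use this when a device they have offloaded
+ * connections on is going away; the module exit path below also uses it to
+ * tear down every remaining connection.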
+ * + * If we pass dev as NULL then this destroys all connections. + */ +void sfe_ipv4_destroy_all_rules_for_dev(struct net_device *dev) +{ + struct sfe_ipv4 *si = &__si; + struct sfe_ipv4_connection *c; + +another_round: + spin_lock_bh(&si->lock); + + for (c = si->all_connections_head; c; c = c->all_connections_next) { + /* + * Does this connection relate to the device we are destroying? + */ + if (!dev + || (dev == c->original_dev) + || (dev == c->reply_dev)) { + break; + } + } + + if (c) { + sfe_ipv4_remove_sfe_ipv4_connection(si, c); + } + + spin_unlock_bh(&si->lock); + + if (c) { + sfe_ipv4_flush_sfe_ipv4_connection(si, c, SFE_SYNC_REASON_DESTROY); + goto another_round; + } +} + +/* + * sfe_ipv4_periodic_sync() + */ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 15, 0)) +static void sfe_ipv4_periodic_sync(unsigned long arg) +#else +static void sfe_ipv4_periodic_sync(struct timer_list *tl) +#endif +{ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 15, 0)) + struct sfe_ipv4 *si = (struct sfe_ipv4 *)arg; +#else + struct sfe_ipv4 *si = from_timer(si, tl, timer); +#endif + u64 now_jiffies; + int quota; + sfe_sync_rule_callback_t sync_rule_callback; + + now_jiffies = get_jiffies_64(); + + rcu_read_lock(); + sync_rule_callback = rcu_dereference(si->sync_rule_callback); + if (!sync_rule_callback) { + rcu_read_unlock(); + goto done; + } + + spin_lock_bh(&si->lock); + sfe_ipv4_update_summary_stats(si); + + /* + * Get an estimate of the number of connections to parse in this sync. + */ + quota = (si->num_connections + 63) / 64; + + /* + * Walk the "active" list and sync the connection state. + */ + while (quota--) { + struct sfe_ipv4_connection_match *cm; + struct sfe_ipv4_connection_match *counter_cm; + struct sfe_ipv4_connection *c; + struct sfe_connection_sync sis; + + cm = si->active_head; + if (!cm) { + break; + } + + /* + * There's a possibility that our counter match is in the active list too. + * If it is then remove it. + */ + counter_cm = cm->counter_match; + if (counter_cm->active) { + counter_cm->active = false; + + /* + * We must have a connection preceding this counter match + * because that's the one that got us to this point, so we don't have + * to worry about removing the head of the list. + */ + counter_cm->active_prev->active_next = counter_cm->active_next; + + if (likely(counter_cm->active_next)) { + counter_cm->active_next->active_prev = counter_cm->active_prev; + } else { + si->active_tail = counter_cm->active_prev; + } + + counter_cm->active_next = NULL; + counter_cm->active_prev = NULL; + } + + /* + * Now remove the head of the active scan list. + */ + cm->active = false; + si->active_head = cm->active_next; + if (likely(cm->active_next)) { + cm->active_next->active_prev = NULL; + } else { + si->active_tail = NULL; + } + cm->active_next = NULL; + + /* + * Sync the connection state. + */ + c = cm->connection; + sfe_ipv4_gen_sync_sfe_ipv4_connection(si, c, &sis, SFE_SYNC_REASON_STATS, now_jiffies); + + /* + * We don't want to be holding the lock when we sync! + */ + spin_unlock_bh(&si->lock); + sync_rule_callback(&sis); + spin_lock_bh(&si->lock); + } + + spin_unlock_bh(&si->lock); + rcu_read_unlock(); + +done: + mod_timer(&si->timer, jiffies + ((HZ + 99) / 100)); +} + +#define CHAR_DEV_MSG_SIZE 768 + +/* + * sfe_ipv4_debug_dev_read_start() + * Generate part of the XML output. 
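+ *
+ * The debug output is produced by a small state machine: start, the list of
+ * connections, the list of non-zero exception counters, the summary stats
+ * and finally the closing element (see sfe_ipv4_debug_xml_write_methods[]
+ * below).  Each state writes at most CHAR_DEV_MSG_SIZE bytes per call.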
+ */ +static bool sfe_ipv4_debug_dev_read_start(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length, + int *total_read, struct sfe_ipv4_debug_xml_write_state *ws) +{ + int bytes_read; + + si->debug_read_seq++; + + bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\n"); + if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) { + return false; + } + + *length -= bytes_read; + *total_read += bytes_read; + + ws->state++; + return true; +} + +/* + * sfe_ipv4_debug_dev_read_connections_start() + * Generate part of the XML output. + */ +static bool sfe_ipv4_debug_dev_read_connections_start(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length, + int *total_read, struct sfe_ipv4_debug_xml_write_state *ws) +{ + int bytes_read; + + bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t\n"); + if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) { + return false; + } + + *length -= bytes_read; + *total_read += bytes_read; + + ws->state++; + return true; +} + +/* + * sfe_ipv4_debug_dev_read_connections_connection() + * Generate part of the XML output. + */ +static bool sfe_ipv4_debug_dev_read_connections_connection(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length, + int *total_read, struct sfe_ipv4_debug_xml_write_state *ws) +{ + struct sfe_ipv4_connection *c; + struct sfe_ipv4_connection_match *original_cm; + struct sfe_ipv4_connection_match *reply_cm; + int bytes_read; + int protocol; + struct net_device *src_dev; + __be32 src_ip; + __be32 src_ip_xlate; + __be16 src_port; + __be16 src_port_xlate; + u64 src_rx_packets; + u64 src_rx_bytes; + struct net_device *dest_dev; + __be32 dest_ip; + __be32 dest_ip_xlate; + __be16 dest_port; + __be16 dest_port_xlate; + u64 dest_rx_packets; + u64 dest_rx_bytes; + u64 last_sync_jiffies; + u32 mark, src_priority, dest_priority, src_dscp, dest_dscp; +#ifdef CONFIG_NF_FLOW_COOKIE + int src_flow_cookie, dst_flow_cookie; +#endif + + spin_lock_bh(&si->lock); + + for (c = si->all_connections_head; c; c = c->all_connections_next) { + if (c->debug_read_seq < si->debug_read_seq) { + c->debug_read_seq = si->debug_read_seq; + break; + } + } + + /* + * If there were no connections then move to the next state. 
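+	 * The per-dump sequence number ensures each connection is reported
+	 * once: debug_read_seq is bumped at the start of a dump and every
+	 * connection we emit is stamped with it, so the loop above always
+	 * picks the next connection that has not yet been visited.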
+ */ + if (!c) { + spin_unlock_bh(&si->lock); + ws->state++; + return true; + } + + original_cm = c->original_match; + reply_cm = c->reply_match; + + protocol = c->protocol; + src_dev = c->original_dev; + src_ip = c->src_ip; + src_ip_xlate = c->src_ip_xlate; + src_port = c->src_port; + src_port_xlate = c->src_port_xlate; + src_priority = original_cm->priority; + src_dscp = original_cm->dscp >> SFE_IPV4_DSCP_SHIFT; + + sfe_ipv4_connection_match_update_summary_stats(original_cm); + sfe_ipv4_connection_match_update_summary_stats(reply_cm); + + src_rx_packets = original_cm->rx_packet_count64; + src_rx_bytes = original_cm->rx_byte_count64; + dest_dev = c->reply_dev; + dest_ip = c->dest_ip; + dest_ip_xlate = c->dest_ip_xlate; + dest_port = c->dest_port; + dest_port_xlate = c->dest_port_xlate; + dest_priority = reply_cm->priority; + dest_dscp = reply_cm->dscp >> SFE_IPV4_DSCP_SHIFT; + dest_rx_packets = reply_cm->rx_packet_count64; + dest_rx_bytes = reply_cm->rx_byte_count64; + last_sync_jiffies = get_jiffies_64() - c->last_sync_jiffies; + mark = c->mark; +#ifdef CONFIG_NF_FLOW_COOKIE + src_flow_cookie = original_cm->flow_cookie; + dst_flow_cookie = reply_cm->flow_cookie; +#endif + spin_unlock_bh(&si->lock); + + bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t\t\n", + protocol, + src_dev->name, + &src_ip, &src_ip_xlate, + ntohs(src_port), ntohs(src_port_xlate), + src_priority, src_dscp, + src_rx_packets, src_rx_bytes, + dest_dev->name, + &dest_ip, &dest_ip_xlate, + ntohs(dest_port), ntohs(dest_port_xlate), + dest_priority, dest_dscp, + dest_rx_packets, dest_rx_bytes, +#ifdef CONFIG_NF_FLOW_COOKIE + src_flow_cookie, dst_flow_cookie, +#endif + last_sync_jiffies, mark); + + if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) { + return false; + } + + *length -= bytes_read; + *total_read += bytes_read; + + return true; +} + +/* + * sfe_ipv4_debug_dev_read_connections_end() + * Generate part of the XML output. + */ +static bool sfe_ipv4_debug_dev_read_connections_end(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length, + int *total_read, struct sfe_ipv4_debug_xml_write_state *ws) +{ + int bytes_read; + + bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t\n"); + if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) { + return false; + } + + *length -= bytes_read; + *total_read += bytes_read; + + ws->state++; + return true; +} + +/* + * sfe_ipv4_debug_dev_read_exceptions_start() + * Generate part of the XML output. + */ +static bool sfe_ipv4_debug_dev_read_exceptions_start(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length, + int *total_read, struct sfe_ipv4_debug_xml_write_state *ws) +{ + int bytes_read; + + bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t\n"); + if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) { + return false; + } + + *length -= bytes_read; + *total_read += bytes_read; + + ws->state++; + return true; +} + +/* + * sfe_ipv4_debug_dev_read_exceptions_exception() + * Generate part of the XML output. 
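+ *
+ * One exception counter is examined per call; only counters that are
+ * non-zero produce any output.  Once the last counter has been visited the
+ * iterator is reset and the state machine moves on.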
+ */ +static bool sfe_ipv4_debug_dev_read_exceptions_exception(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length, + int *total_read, struct sfe_ipv4_debug_xml_write_state *ws) +{ + u64 ct; + + spin_lock_bh(&si->lock); + ct = si->exception_events64[ws->iter_exception]; + spin_unlock_bh(&si->lock); + + if (ct) { + int bytes_read; + + bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, + "\t\t\n", + sfe_ipv4_exception_events_string[ws->iter_exception], + ct); + if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) { + return false; + } + + *length -= bytes_read; + *total_read += bytes_read; + } + + ws->iter_exception++; + if (ws->iter_exception >= SFE_IPV4_EXCEPTION_EVENT_LAST) { + ws->iter_exception = 0; + ws->state++; + } + + return true; +} + +/* + * sfe_ipv4_debug_dev_read_exceptions_end() + * Generate part of the XML output. + */ +static bool sfe_ipv4_debug_dev_read_exceptions_end(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length, + int *total_read, struct sfe_ipv4_debug_xml_write_state *ws) +{ + int bytes_read; + + bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t\n"); + if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) { + return false; + } + + *length -= bytes_read; + *total_read += bytes_read; + + ws->state++; + return true; +} + +/* + * sfe_ipv4_debug_dev_read_stats() + * Generate part of the XML output. + */ +static bool sfe_ipv4_debug_dev_read_stats(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length, + int *total_read, struct sfe_ipv4_debug_xml_write_state *ws) +{ + int bytes_read; + unsigned int num_connections; + u64 packets_forwarded; + u64 packets_not_forwarded; + u64 connection_create_requests; + u64 connection_create_collisions; + u64 connection_destroy_requests; + u64 connection_destroy_misses; + u64 connection_flushes; + u64 connection_match_hash_hits; + u64 connection_match_hash_reorders; + + spin_lock_bh(&si->lock); + sfe_ipv4_update_summary_stats(si); + + num_connections = si->num_connections; + packets_forwarded = si->packets_forwarded64; + packets_not_forwarded = si->packets_not_forwarded64; + connection_create_requests = si->connection_create_requests64; + connection_create_collisions = si->connection_create_collisions64; + connection_destroy_requests = si->connection_destroy_requests64; + connection_destroy_misses = si->connection_destroy_misses64; + connection_flushes = si->connection_flushes64; + connection_match_hash_hits = si->connection_match_hash_hits64; + connection_match_hash_reorders = si->connection_match_hash_reorders64; + spin_unlock_bh(&si->lock); + + bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t\n", + num_connections, + packets_forwarded, + packets_not_forwarded, + connection_create_requests, + connection_create_collisions, + connection_destroy_requests, + connection_destroy_misses, + connection_flushes, + connection_match_hash_hits, + connection_match_hash_reorders); + if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) { + return false; + } + + *length -= bytes_read; + *total_read += bytes_read; + + ws->state++; + return true; +} + +/* + * sfe_ipv4_debug_dev_read_end() + * Generate part of the XML output. 
+ */ +static bool sfe_ipv4_debug_dev_read_end(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length, + int *total_read, struct sfe_ipv4_debug_xml_write_state *ws) +{ + int bytes_read; + + bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\n"); + if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) { + return false; + } + + *length -= bytes_read; + *total_read += bytes_read; + + ws->state++; + return true; +} + +/* + * Array of write functions that write various XML elements that correspond to + * our XML output state machine. + */ +static sfe_ipv4_debug_xml_write_method_t sfe_ipv4_debug_xml_write_methods[SFE_IPV4_DEBUG_XML_STATE_DONE] = { + sfe_ipv4_debug_dev_read_start, + sfe_ipv4_debug_dev_read_connections_start, + sfe_ipv4_debug_dev_read_connections_connection, + sfe_ipv4_debug_dev_read_connections_end, + sfe_ipv4_debug_dev_read_exceptions_start, + sfe_ipv4_debug_dev_read_exceptions_exception, + sfe_ipv4_debug_dev_read_exceptions_end, + sfe_ipv4_debug_dev_read_stats, + sfe_ipv4_debug_dev_read_end, +}; + +/* + * sfe_ipv4_debug_dev_read() + * Send info to userspace upon read request from user + */ +static ssize_t sfe_ipv4_debug_dev_read(struct file *filp, char *buffer, size_t length, loff_t *offset) +{ + char msg[CHAR_DEV_MSG_SIZE]; + int total_read = 0; + struct sfe_ipv4_debug_xml_write_state *ws; + struct sfe_ipv4 *si = &__si; + + ws = (struct sfe_ipv4_debug_xml_write_state *)filp->private_data; + while ((ws->state != SFE_IPV4_DEBUG_XML_STATE_DONE) && (length > CHAR_DEV_MSG_SIZE)) { + if ((sfe_ipv4_debug_xml_write_methods[ws->state])(si, buffer, msg, &length, &total_read, ws)) { + continue; + } + } + + return total_read; +} + +/* + * sfe_ipv4_debug_dev_write() + * Write to char device resets some stats + */ +static ssize_t sfe_ipv4_debug_dev_write(struct file *filp, const char *buffer, size_t length, loff_t *offset) +{ + struct sfe_ipv4 *si = &__si; + + spin_lock_bh(&si->lock); + sfe_ipv4_update_summary_stats(si); + + si->packets_forwarded64 = 0; + si->packets_not_forwarded64 = 0; + si->connection_create_requests64 = 0; + si->connection_create_collisions64 = 0; + si->connection_destroy_requests64 = 0; + si->connection_destroy_misses64 = 0; + si->connection_flushes64 = 0; + si->connection_match_hash_hits64 = 0; + si->connection_match_hash_reorders64 = 0; + spin_unlock_bh(&si->lock); + + return length; +} + +/* + * sfe_ipv4_debug_dev_open() + */ +static int sfe_ipv4_debug_dev_open(struct inode *inode, struct file *file) +{ + struct sfe_ipv4_debug_xml_write_state *ws; + + ws = (struct sfe_ipv4_debug_xml_write_state *)file->private_data; + if (!ws) { + ws = kzalloc(sizeof(struct sfe_ipv4_debug_xml_write_state), GFP_KERNEL); + if (!ws) { + return -ENOMEM; + } + + ws->state = SFE_IPV4_DEBUG_XML_STATE_START; + file->private_data = ws; + } + + return 0; +} + +/* + * sfe_ipv4_debug_dev_release() + */ +static int sfe_ipv4_debug_dev_release(struct inode *inode, struct file *file) +{ + struct sfe_ipv4_debug_xml_write_state *ws; + + ws = (struct sfe_ipv4_debug_xml_write_state *)file->private_data; + if (ws) { + /* + * We've finished with our output so free the write state. 
+ */ + kfree(ws); + } + + return 0; +} + +/* + * File operations used in the debug char device + */ +static struct file_operations sfe_ipv4_debug_dev_fops = { + .read = sfe_ipv4_debug_dev_read, + .write = sfe_ipv4_debug_dev_write, + .open = sfe_ipv4_debug_dev_open, + .release = sfe_ipv4_debug_dev_release +}; + +#ifdef CONFIG_NF_FLOW_COOKIE +/* + * sfe_register_flow_cookie_cb + * register a function in SFE to let SFE use this function to configure flow cookie for a flow + * + * Hardware driver which support flow cookie should register a callback function in SFE. Then SFE + * can use this function to configure flow cookie for a flow. + * return: 0, success; !=0, fail + */ +int sfe_register_flow_cookie_cb(flow_cookie_set_func_t cb) +{ + struct sfe_ipv4 *si = &__si; + + BUG_ON(!cb); + + if (si->flow_cookie_set_func) { + return -1; + } + + rcu_assign_pointer(si->flow_cookie_set_func, cb); + return 0; +} + +/* + * sfe_unregister_flow_cookie_cb + * unregister function which is used to configure flow cookie for a flow + * + * return: 0, success; !=0, fail + */ +int sfe_unregister_flow_cookie_cb(flow_cookie_set_func_t cb) +{ + struct sfe_ipv4 *si = &__si; + + RCU_INIT_POINTER(si->flow_cookie_set_func, NULL); + return 0; +} + +/* + * sfe_ipv4_get_flow_cookie() + */ +static ssize_t sfe_ipv4_get_flow_cookie(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct sfe_ipv4 *si = &__si; + return snprintf(buf, (ssize_t)PAGE_SIZE, "%d\n", si->flow_cookie_enable); +} + +/* + * sfe_ipv4_set_flow_cookie() + */ +static ssize_t sfe_ipv4_set_flow_cookie(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t size) +{ + struct sfe_ipv4 *si = &__si; + strict_strtol(buf, 0, (long int *)&si->flow_cookie_enable); + + return size; +} + +/* + * sysfs attributes. + */ +static const struct device_attribute sfe_ipv4_flow_cookie_attr = + __ATTR(flow_cookie_enable, S_IWUSR | S_IRUGO, sfe_ipv4_get_flow_cookie, sfe_ipv4_set_flow_cookie); +#endif /*CONFIG_NF_FLOW_COOKIE*/ + +/* + * sfe_ipv4_init() + */ +static int __init sfe_ipv4_init(void) +{ + struct sfe_ipv4 *si = &__si; + int result = -1; + + DEBUG_INFO("SFE IPv4 init\n"); + + /* + * Create sys/sfe_ipv4 + */ + si->sys_sfe_ipv4 = kobject_create_and_add("sfe_ipv4", NULL); + if (!si->sys_sfe_ipv4) { + DEBUG_ERROR("failed to register sfe_ipv4\n"); + goto exit1; + } + + /* + * Create files, one for each parameter supported by this module. + */ + result = sysfs_create_file(si->sys_sfe_ipv4, &sfe_ipv4_debug_dev_attr.attr); + if (result) { + DEBUG_ERROR("failed to register debug dev file: %d\n", result); + goto exit2; + } + +#ifdef CONFIG_NF_FLOW_COOKIE + result = sysfs_create_file(si->sys_sfe_ipv4, &sfe_ipv4_flow_cookie_attr.attr); + if (result) { + DEBUG_ERROR("failed to register flow cookie enable file: %d\n", result); + goto exit3; + } +#endif /* CONFIG_NF_FLOW_COOKIE */ + + /* + * Register our debug char device. + */ + result = register_chrdev(0, "sfe_ipv4", &sfe_ipv4_debug_dev_fops); + if (result < 0) { + DEBUG_ERROR("Failed to register chrdev: %d\n", result); + goto exit4; + } + + si->debug_dev = result; + + /* + * Create a timer to handle periodic statistics. 
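+	 * The period below, (HZ + 99) / 100 jiffies, is 10ms rounded up to a
+	 * whole number of ticks: 1 jiffy at HZ=100, 3 at HZ=250 (12ms), 10 at
+	 * HZ=1000.  Each firing syncs at most about 1/64th of the connections
+	 * (see sfe_ipv4_periodic_sync() above).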
+ */ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 15, 0)) + setup_timer(&si->timer, sfe_ipv4_periodic_sync, (unsigned long)si); +#else + timer_setup(&si->timer, sfe_ipv4_periodic_sync, 0); +#endif + mod_timer(&si->timer, jiffies + ((HZ + 99) / 100)); + + spin_lock_init(&si->lock); + + return 0; + +exit4: +#ifdef CONFIG_NF_FLOW_COOKIE + sysfs_remove_file(si->sys_sfe_ipv4, &sfe_ipv4_flow_cookie_attr.attr); + +exit3: +#endif /* CONFIG_NF_FLOW_COOKIE */ + sysfs_remove_file(si->sys_sfe_ipv4, &sfe_ipv4_debug_dev_attr.attr); + +exit2: + kobject_put(si->sys_sfe_ipv4); + +exit1: + return result; +} + +/* + * sfe_ipv4_exit() + */ +static void __exit sfe_ipv4_exit(void) +{ + struct sfe_ipv4 *si = &__si; + + DEBUG_INFO("SFE IPv4 exit\n"); + + /* + * Destroy all connections. + */ + sfe_ipv4_destroy_all_rules_for_dev(NULL); + + del_timer_sync(&si->timer); + + unregister_chrdev(si->debug_dev, "sfe_ipv4"); + +#ifdef CONFIG_NF_FLOW_COOKIE + sysfs_remove_file(si->sys_sfe_ipv4, &sfe_ipv4_flow_cookie_attr.attr); +#endif /* CONFIG_NF_FLOW_COOKIE */ + sysfs_remove_file(si->sys_sfe_ipv4, &sfe_ipv4_debug_dev_attr.attr); + + kobject_put(si->sys_sfe_ipv4); + +} + +module_init(sfe_ipv4_init) +module_exit(sfe_ipv4_exit) + +EXPORT_SYMBOL(sfe_ipv4_recv); +EXPORT_SYMBOL(sfe_ipv4_create_rule); +EXPORT_SYMBOL(sfe_ipv4_destroy_rule); +EXPORT_SYMBOL(sfe_ipv4_destroy_all_rules_for_dev); +EXPORT_SYMBOL(sfe_ipv4_register_sync_rule_callback); +EXPORT_SYMBOL(sfe_ipv4_mark_rule); +EXPORT_SYMBOL(sfe_ipv4_update_rule); +#ifdef CONFIG_NF_FLOW_COOKIE +EXPORT_SYMBOL(sfe_register_flow_cookie_cb); +EXPORT_SYMBOL(sfe_unregister_flow_cookie_cb); +#endif + +MODULE_DESCRIPTION("Shortcut Forwarding Engine - IPv4 edition"); +MODULE_LICENSE("Dual BSD/GPL"); + diff --git a/shortcut-fe/sfe_ipv6.c b/shortcut-fe/sfe_ipv6.c new file mode 100644 index 000000000..a7cb811a9 --- /dev/null +++ b/shortcut-fe/sfe_ipv6.c @@ -0,0 +1,3617 @@ +/* + * sfe_ipv6.c + * Shortcut forwarding engine - IPv6 support. + * + * Copyright (c) 2015-2016, 2019-2020 The Linux Foundation. All rights reserved. + * Permission to use, copy, modify, and/or distribute this software for + * any purpose with or without fee is hereby granted, provided that the + * above copyright notice and this permission notice appear in all copies. + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT + * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "sfe.h" +#include "sfe_cm.h" + +/* + * By default Linux IP header and transport layer header structures are + * unpacked, assuming that such headers should be 32-bit aligned. + * Unfortunately some wireless adaptors can't cope with this requirement and + * some CPUs can't handle misaligned accesses. For those platforms we + * define SFE_IPV6_UNALIGNED_IP_HEADER and mark the structures as packed. + * When we do this the compiler will generate slightly worse code than for the + * aligned case (on most platforms) but will be much quicker than fixing + * things up in an unaligned trap handler. 
+ */ +#define SFE_IPV6_UNALIGNED_IP_HEADER 1 +#if SFE_IPV6_UNALIGNED_IP_HEADER +#define SFE_IPV6_UNALIGNED_STRUCT __attribute__((packed)) +#else +#define SFE_IPV6_UNALIGNED_STRUCT +#endif + +#define CHAR_DEV_MSG_SIZE 768 + +/* + * An Ethernet header, but with an optional "packed" attribute to + * help with performance on some platforms (see the definition of + * SFE_IPV6_UNALIGNED_STRUCT) + */ +struct sfe_ipv6_eth_hdr { + __be16 h_dest[ETH_ALEN / 2]; + __be16 h_source[ETH_ALEN / 2]; + __be16 h_proto; +} SFE_IPV6_UNALIGNED_STRUCT; + +#define SFE_IPV6_DSCP_MASK 0xf03f +#define SFE_IPV6_DSCP_SHIFT 2 + +/* + * An IPv6 header, but with an optional "packed" attribute to + * help with performance on some platforms (see the definition of + * SFE_IPV6_UNALIGNED_STRUCT) + */ +struct sfe_ipv6_ip_hdr { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 priority:4, + version:4; +#elif defined(__BIG_ENDIAN_BITFIELD) + __u8 version:4, + priority:4; +#else +#error "Please fix " +#endif + __u8 flow_lbl[3]; + __be16 payload_len; + __u8 nexthdr; + __u8 hop_limit; + struct sfe_ipv6_addr saddr; + struct sfe_ipv6_addr daddr; + + /* + * The extension header start here. + */ +} SFE_IPV6_UNALIGNED_STRUCT; + +#define SFE_IPV6_EXT_HDR_HOP 0 +#define SFE_IPV6_EXT_HDR_ROUTING 43 +#define SFE_IPV6_EXT_HDR_FRAG 44 +#define SFE_IPV6_EXT_HDR_ESP 50 +#define SFE_IPV6_EXT_HDR_AH 51 +#define SFE_IPV6_EXT_HDR_NONE 59 +#define SFE_IPV6_EXT_HDR_DST 60 +#define SFE_IPV6_EXT_HDR_MH 135 + +/* + * fragmentation header + */ + +struct sfe_ipv6_frag_hdr { + __u8 nexthdr; + __u8 reserved; + __be16 frag_off; + __be32 identification; +}; + +#define SFE_IPV6_FRAG_OFFSET 0xfff8 + +/* + * generic IPv6 extension header + */ +struct sfe_ipv6_ext_hdr { + __u8 next_hdr; + __u8 hdr_len; + __u8 padding[6]; +} SFE_IPV6_UNALIGNED_STRUCT; + +/* + * A UDP header, but with an optional "packed" attribute to + * help with performance on some platforms (see the definition of + * SFE_IPV6_UNALIGNED_STRUCT) + */ +struct sfe_ipv6_udp_hdr { + __be16 source; + __be16 dest; + __be16 len; + __sum16 check; +} SFE_IPV6_UNALIGNED_STRUCT; + +/* + * A TCP header, but with an optional "packed" attribute to + * help with performance on some platforms (see the definition of + * SFE_IPV6_UNALIGNED_STRUCT) + */ +struct sfe_ipv6_tcp_hdr { + __be16 source; + __be16 dest; + __be32 seq; + __be32 ack_seq; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u16 res1:4, + doff:4, + fin:1, + syn:1, + rst:1, + psh:1, + ack:1, + urg:1, + ece:1, + cwr:1; +#elif defined(__BIG_ENDIAN_BITFIELD) + __u16 doff:4, + res1:4, + cwr:1, + ece:1, + urg:1, + ack:1, + psh:1, + rst:1, + syn:1, + fin:1; +#else +#error "Adjust your defines" +#endif + __be16 window; + __sum16 check; + __be16 urg_ptr; +} SFE_IPV6_UNALIGNED_STRUCT; + +/* + * Specifies the lower bound on ACK numbers carried in the TCP header + */ +#define SFE_IPV6_TCP_MAX_ACK_WINDOW 65520 + +/* + * IPv6 TCP connection match additional data. + */ +struct sfe_ipv6_tcp_connection_match { + u8 win_scale; /* Window scale */ + u32 max_win; /* Maximum window size seen */ + u32 end; /* Sequence number of the next byte to send (seq + segment length) */ + u32 max_end; /* Sequence number of the last byte to ack */ +}; + +/* + * Bit flags for IPv6 connection matching entry. 
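+ *
+ * WRITE_L2_HDR means some L2 header has to be written before transmit (it is
+ * left clear for point-to-point links); WRITE_FAST_ETH_HDR additionally
+ * means the device takes a plain Ethernet header, so the header can be
+ * written directly in the fast path, as in the IPv4 code above.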
+ */ +#define SFE_IPV6_CONNECTION_MATCH_FLAG_XLATE_SRC (1<<0) + /* Perform source translation */ +#define SFE_IPV6_CONNECTION_MATCH_FLAG_XLATE_DEST (1<<1) + /* Perform destination translation */ +#define SFE_IPV6_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK (1<<2) + /* Ignore TCP sequence numbers */ +#define SFE_IPV6_CONNECTION_MATCH_FLAG_WRITE_FAST_ETH_HDR (1<<3) + /* Fast Ethernet header write */ +#define SFE_IPV6_CONNECTION_MATCH_FLAG_WRITE_L2_HDR (1<<4) + /* Fast Ethernet header write */ +#define SFE_IPV6_CONNECTION_MATCH_FLAG_PRIORITY_REMARK (1<<5) + /* remark priority of SKB */ +#define SFE_IPV6_CONNECTION_MATCH_FLAG_DSCP_REMARK (1<<6) + /* remark DSCP of packet */ + +/* + * IPv6 connection matching structure. + */ +struct sfe_ipv6_connection_match { + /* + * References to other objects. + */ + struct sfe_ipv6_connection_match *next; + struct sfe_ipv6_connection_match *prev; + struct sfe_ipv6_connection *connection; + struct sfe_ipv6_connection_match *counter_match; + /* Matches the flow in the opposite direction as the one in connection */ + struct sfe_ipv6_connection_match *active_next; + struct sfe_ipv6_connection_match *active_prev; + bool active; /* Flag to indicate if we're on the active list */ + + /* + * Characteristics that identify flows that match this rule. + */ + struct net_device *match_dev; /* Network device */ + u8 match_protocol; /* Protocol */ + struct sfe_ipv6_addr match_src_ip[1]; /* Source IP address */ + struct sfe_ipv6_addr match_dest_ip[1]; /* Destination IP address */ + __be16 match_src_port; /* Source port/connection ident */ + __be16 match_dest_port; /* Destination port/connection ident */ + + /* + * Control the operations of the match. + */ + u32 flags; /* Bit flags */ +#ifdef CONFIG_NF_FLOW_COOKIE + u32 flow_cookie; /* used flow cookie, for debug */ +#endif +#ifdef CONFIG_XFRM + u32 flow_accel; /* The flow accelerated or not */ +#endif + + /* + * Connection state that we track once we match. + */ + union { /* Protocol-specific state */ + struct sfe_ipv6_tcp_connection_match tcp; + } protocol_state; + /* + * Stats recorded in a sync period. These stats will be added to + * rx_packet_count64/rx_byte_count64 after a sync period. + */ + u32 rx_packet_count; + u32 rx_byte_count; + + /* + * Packet translation information. + */ + struct sfe_ipv6_addr xlate_src_ip[1]; /* Address after source translation */ + __be16 xlate_src_port; /* Port/connection ident after source translation */ + u16 xlate_src_csum_adjustment; + /* Transport layer checksum adjustment after source translation */ + struct sfe_ipv6_addr xlate_dest_ip[1]; /* Address after destination translation */ + __be16 xlate_dest_port; /* Port/connection ident after destination translation */ + u16 xlate_dest_csum_adjustment; + /* Transport layer checksum adjustment after destination translation */ + + /* + * QoS information + */ + u32 priority; + u32 dscp; + + /* + * Packet transmit information. + */ + struct net_device *xmit_dev; /* Network device on which to transmit */ + unsigned short int xmit_dev_mtu; + /* Interface MTU */ + u16 xmit_dest_mac[ETH_ALEN / 2]; + /* Destination MAC address to use when forwarding */ + u16 xmit_src_mac[ETH_ALEN / 2]; + /* Source MAC address to use when forwarding */ + + /* + * Summary stats. + */ + u64 rx_packet_count64; + u64 rx_byte_count64; +}; + +/* + * Per-connection data structure. 
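+ *
+ * Each connection owns two match entries, one per direction (original and
+ * reply); both live in the connection match hash and point back to this
+ * structure through their "connection" field.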
+ */ +struct sfe_ipv6_connection { + struct sfe_ipv6_connection *next; + /* Pointer to the next entry in a hash chain */ + struct sfe_ipv6_connection *prev; + /* Pointer to the previous entry in a hash chain */ + int protocol; /* IP protocol number */ + struct sfe_ipv6_addr src_ip[1]; /* Src IP addr pre-translation */ + struct sfe_ipv6_addr src_ip_xlate[1]; /* Src IP addr post-translation */ + struct sfe_ipv6_addr dest_ip[1]; /* Dest IP addr pre-translation */ + struct sfe_ipv6_addr dest_ip_xlate[1]; /* Dest IP addr post-translation */ + __be16 src_port; /* Src port pre-translation */ + __be16 src_port_xlate; /* Src port post-translation */ + __be16 dest_port; /* Dest port pre-translation */ + __be16 dest_port_xlate; /* Dest port post-translation */ + struct sfe_ipv6_connection_match *original_match; + /* Original direction matching structure */ + struct net_device *original_dev; + /* Original direction source device */ + struct sfe_ipv6_connection_match *reply_match; + /* Reply direction matching structure */ + struct net_device *reply_dev; /* Reply direction source device */ + u64 last_sync_jiffies; /* Jiffies count for the last sync */ + struct sfe_ipv6_connection *all_connections_next; + /* Pointer to the next entry in the list of all connections */ + struct sfe_ipv6_connection *all_connections_prev; + /* Pointer to the previous entry in the list of all connections */ + u32 mark; /* mark for outgoing packet */ + u32 debug_read_seq; /* sequence number for debug dump */ +}; + +/* + * IPv6 connections and hash table size information. + */ +#define SFE_IPV6_CONNECTION_HASH_SHIFT 12 +#define SFE_IPV6_CONNECTION_HASH_SIZE (1 << SFE_IPV6_CONNECTION_HASH_SHIFT) +#define SFE_IPV6_CONNECTION_HASH_MASK (SFE_IPV6_CONNECTION_HASH_SIZE - 1) + +#ifdef CONFIG_NF_FLOW_COOKIE +#define SFE_FLOW_COOKIE_SIZE 2048 +#define SFE_FLOW_COOKIE_MASK 0x7ff + +struct sfe_ipv6_flow_cookie_entry { + struct sfe_ipv6_connection_match *match; + unsigned long last_clean_time; +}; +#endif + +enum sfe_ipv6_exception_events { + SFE_IPV6_EXCEPTION_EVENT_UDP_HEADER_INCOMPLETE, + SFE_IPV6_EXCEPTION_EVENT_UDP_NO_CONNECTION, + SFE_IPV6_EXCEPTION_EVENT_UDP_IP_OPTIONS_OR_INITIAL_FRAGMENT, + SFE_IPV6_EXCEPTION_EVENT_UDP_SMALL_TTL, + SFE_IPV6_EXCEPTION_EVENT_UDP_NEEDS_FRAGMENTATION, + SFE_IPV6_EXCEPTION_EVENT_TCP_HEADER_INCOMPLETE, + SFE_IPV6_EXCEPTION_EVENT_TCP_NO_CONNECTION_SLOW_FLAGS, + SFE_IPV6_EXCEPTION_EVENT_TCP_NO_CONNECTION_FAST_FLAGS, + SFE_IPV6_EXCEPTION_EVENT_TCP_IP_OPTIONS_OR_INITIAL_FRAGMENT, + SFE_IPV6_EXCEPTION_EVENT_TCP_SMALL_TTL, + SFE_IPV6_EXCEPTION_EVENT_TCP_NEEDS_FRAGMENTATION, + SFE_IPV6_EXCEPTION_EVENT_TCP_FLAGS, + SFE_IPV6_EXCEPTION_EVENT_TCP_SEQ_EXCEEDS_RIGHT_EDGE, + SFE_IPV6_EXCEPTION_EVENT_TCP_SMALL_DATA_OFFS, + SFE_IPV6_EXCEPTION_EVENT_TCP_BAD_SACK, + SFE_IPV6_EXCEPTION_EVENT_TCP_BIG_DATA_OFFS, + SFE_IPV6_EXCEPTION_EVENT_TCP_SEQ_BEFORE_LEFT_EDGE, + SFE_IPV6_EXCEPTION_EVENT_TCP_ACK_EXCEEDS_RIGHT_EDGE, + SFE_IPV6_EXCEPTION_EVENT_TCP_ACK_BEFORE_LEFT_EDGE, + SFE_IPV6_EXCEPTION_EVENT_ICMP_HEADER_INCOMPLETE, + SFE_IPV6_EXCEPTION_EVENT_ICMP_UNHANDLED_TYPE, + SFE_IPV6_EXCEPTION_EVENT_ICMP_IPV6_HEADER_INCOMPLETE, + SFE_IPV6_EXCEPTION_EVENT_ICMP_IPV6_NON_V6, + SFE_IPV6_EXCEPTION_EVENT_ICMP_IPV6_IP_OPTIONS_INCOMPLETE, + SFE_IPV6_EXCEPTION_EVENT_ICMP_IPV6_UDP_HEADER_INCOMPLETE, + SFE_IPV6_EXCEPTION_EVENT_ICMP_IPV6_TCP_HEADER_INCOMPLETE, + SFE_IPV6_EXCEPTION_EVENT_ICMP_IPV6_UNHANDLED_PROTOCOL, + SFE_IPV6_EXCEPTION_EVENT_ICMP_NO_CONNECTION, + SFE_IPV6_EXCEPTION_EVENT_ICMP_FLUSHED_CONNECTION, + 
SFE_IPV6_EXCEPTION_EVENT_HEADER_INCOMPLETE, + SFE_IPV6_EXCEPTION_EVENT_BAD_TOTAL_LENGTH, + SFE_IPV6_EXCEPTION_EVENT_NON_V6, + SFE_IPV6_EXCEPTION_EVENT_NON_INITIAL_FRAGMENT, + SFE_IPV6_EXCEPTION_EVENT_DATAGRAM_INCOMPLETE, + SFE_IPV6_EXCEPTION_EVENT_IP_OPTIONS_INCOMPLETE, + SFE_IPV6_EXCEPTION_EVENT_UNHANDLED_PROTOCOL, + SFE_IPV6_EXCEPTION_EVENT_FLOW_COOKIE_ADD_FAIL, + SFE_IPV6_EXCEPTION_EVENT_LAST +}; + +static char *sfe_ipv6_exception_events_string[SFE_IPV6_EXCEPTION_EVENT_LAST] = { + "UDP_HEADER_INCOMPLETE", + "UDP_NO_CONNECTION", + "UDP_IP_OPTIONS_OR_INITIAL_FRAGMENT", + "UDP_SMALL_TTL", + "UDP_NEEDS_FRAGMENTATION", + "TCP_HEADER_INCOMPLETE", + "TCP_NO_CONNECTION_SLOW_FLAGS", + "TCP_NO_CONNECTION_FAST_FLAGS", + "TCP_IP_OPTIONS_OR_INITIAL_FRAGMENT", + "TCP_SMALL_TTL", + "TCP_NEEDS_FRAGMENTATION", + "TCP_FLAGS", + "TCP_SEQ_EXCEEDS_RIGHT_EDGE", + "TCP_SMALL_DATA_OFFS", + "TCP_BAD_SACK", + "TCP_BIG_DATA_OFFS", + "TCP_SEQ_BEFORE_LEFT_EDGE", + "TCP_ACK_EXCEEDS_RIGHT_EDGE", + "TCP_ACK_BEFORE_LEFT_EDGE", + "ICMP_HEADER_INCOMPLETE", + "ICMP_UNHANDLED_TYPE", + "ICMP_IPV6_HEADER_INCOMPLETE", + "ICMP_IPV6_NON_V6", + "ICMP_IPV6_IP_OPTIONS_INCOMPLETE", + "ICMP_IPV6_UDP_HEADER_INCOMPLETE", + "ICMP_IPV6_TCP_HEADER_INCOMPLETE", + "ICMP_IPV6_UNHANDLED_PROTOCOL", + "ICMP_NO_CONNECTION", + "ICMP_FLUSHED_CONNECTION", + "HEADER_INCOMPLETE", + "BAD_TOTAL_LENGTH", + "NON_V6", + "NON_INITIAL_FRAGMENT", + "DATAGRAM_INCOMPLETE", + "IP_OPTIONS_INCOMPLETE", + "UNHANDLED_PROTOCOL", + "FLOW_COOKIE_ADD_FAIL" +}; + +/* + * Per-module structure. + */ +struct sfe_ipv6 { + spinlock_t lock; /* Lock for SMP correctness */ + struct sfe_ipv6_connection_match *active_head; + /* Head of the list of recently active connections */ + struct sfe_ipv6_connection_match *active_tail; + /* Tail of the list of recently active connections */ + struct sfe_ipv6_connection *all_connections_head; + /* Head of the list of all connections */ + struct sfe_ipv6_connection *all_connections_tail; + /* Tail of the list of all connections */ + unsigned int num_connections; /* Number of connections */ + struct timer_list timer; /* Timer used for periodic sync ops */ + sfe_sync_rule_callback_t __rcu sync_rule_callback; + /* Callback function registered by a connection manager for stats syncing */ + struct sfe_ipv6_connection *conn_hash[SFE_IPV6_CONNECTION_HASH_SIZE]; + /* Connection hash table */ + struct sfe_ipv6_connection_match *conn_match_hash[SFE_IPV6_CONNECTION_HASH_SIZE]; + /* Connection match hash table */ +#ifdef CONFIG_NF_FLOW_COOKIE + struct sfe_ipv6_flow_cookie_entry sfe_flow_cookie_table[SFE_FLOW_COOKIE_SIZE]; + /* flow cookie table*/ + sfe_ipv6_flow_cookie_set_func_t flow_cookie_set_func; + /* function used to configure flow cookie in hardware*/ + int flow_cookie_enable; + /* Enable/disable flow cookie at runtime */ +#endif + + /* + * Stats recorded in a sync period. These stats will be added to + * connection_xxx64 after a sync period. 
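+	 * The fold into the 64-bit counters below is done by
+	 * sfe_ipv6_update_summary_stats().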
+ */ + u32 connection_create_requests; + /* Number of IPv6 connection create requests */ + u32 connection_create_collisions; + /* Number of IPv6 connection create requests that collided with existing hash table entries */ + u32 connection_destroy_requests; + /* Number of IPv6 connection destroy requests */ + u32 connection_destroy_misses; + /* Number of IPv6 connection destroy requests that missed our hash table */ + u32 connection_match_hash_hits; + /* Number of IPv6 connection match hash hits */ + u32 connection_match_hash_reorders; + /* Number of IPv6 connection match hash reorders */ + u32 connection_flushes; /* Number of IPv6 connection flushes */ + u32 packets_forwarded; /* Number of IPv6 packets forwarded */ + u32 packets_not_forwarded; /* Number of IPv6 packets not forwarded */ + u32 exception_events[SFE_IPV6_EXCEPTION_EVENT_LAST]; + + /* + * Summary statistics. + */ + u64 connection_create_requests64; + /* Number of IPv6 connection create requests */ + u64 connection_create_collisions64; + /* Number of IPv6 connection create requests that collided with existing hash table entries */ + u64 connection_destroy_requests64; + /* Number of IPv6 connection destroy requests */ + u64 connection_destroy_misses64; + /* Number of IPv6 connection destroy requests that missed our hash table */ + u64 connection_match_hash_hits64; + /* Number of IPv6 connection match hash hits */ + u64 connection_match_hash_reorders64; + /* Number of IPv6 connection match hash reorders */ + u64 connection_flushes64; /* Number of IPv6 connection flushes */ + u64 packets_forwarded64; /* Number of IPv6 packets forwarded */ + u64 packets_not_forwarded64; + /* Number of IPv6 packets not forwarded */ + u64 exception_events64[SFE_IPV6_EXCEPTION_EVENT_LAST]; + + /* + * Control state. + */ + struct kobject *sys_sfe_ipv6; /* sysfs linkage */ + int debug_dev; /* Major number of the debug char device */ + u32 debug_read_seq; /* sequence number for debug dump */ +}; + +/* + * Enumeration of the XML output. + */ +enum sfe_ipv6_debug_xml_states { + SFE_IPV6_DEBUG_XML_STATE_START, + SFE_IPV6_DEBUG_XML_STATE_CONNECTIONS_START, + SFE_IPV6_DEBUG_XML_STATE_CONNECTIONS_CONNECTION, + SFE_IPV6_DEBUG_XML_STATE_CONNECTIONS_END, + SFE_IPV6_DEBUG_XML_STATE_EXCEPTIONS_START, + SFE_IPV6_DEBUG_XML_STATE_EXCEPTIONS_EXCEPTION, + SFE_IPV6_DEBUG_XML_STATE_EXCEPTIONS_END, + SFE_IPV6_DEBUG_XML_STATE_STATS, + SFE_IPV6_DEBUG_XML_STATE_END, + SFE_IPV6_DEBUG_XML_STATE_DONE +}; + +/* + * XML write state. + */ +struct sfe_ipv6_debug_xml_write_state { + enum sfe_ipv6_debug_xml_states state; + /* XML output file state machine state */ + int iter_exception; /* Next exception iterator */ +}; + +typedef bool (*sfe_ipv6_debug_xml_write_method_t)(struct sfe_ipv6 *si, char *buffer, char *msg, size_t *length, + int *total_read, struct sfe_ipv6_debug_xml_write_state *ws); + +static struct sfe_ipv6 __si6; + +/* + * sfe_ipv6_get_debug_dev() + */ +static ssize_t sfe_ipv6_get_debug_dev(struct device *dev, struct device_attribute *attr, char *buf); + +/* + * sysfs attributes. 
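+ *
+ * The debug_dev attribute exposes the major number of the debug character
+ * device so that userspace can create a device node and read the XML state
+ * dump (for example from /sys/sfe_ipv6/debug_dev, assuming the kobject is
+ * registered under that name as in the IPv4 module).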
+ */ +static const struct device_attribute sfe_ipv6_debug_dev_attr = + __ATTR(debug_dev, S_IWUSR | S_IRUGO, sfe_ipv6_get_debug_dev, NULL); + +/* + * sfe_ipv6_is_ext_hdr() + * check if we recognize ipv6 extension header + */ +static inline bool sfe_ipv6_is_ext_hdr(u8 hdr) +{ + return (hdr == SFE_IPV6_EXT_HDR_HOP) || + (hdr == SFE_IPV6_EXT_HDR_ROUTING) || + (hdr == SFE_IPV6_EXT_HDR_FRAG) || + (hdr == SFE_IPV6_EXT_HDR_AH) || + (hdr == SFE_IPV6_EXT_HDR_DST) || + (hdr == SFE_IPV6_EXT_HDR_MH); +} + +/* + * sfe_ipv6_change_dsfield() + * change dscp field in IPv6 packet + */ +static inline void sfe_ipv6_change_dsfield(struct sfe_ipv6_ip_hdr *iph, u8 dscp) +{ + __be16 *p = (__be16 *)iph; + + *p = ((*p & htons(SFE_IPV6_DSCP_MASK)) | htons((u16)dscp << 4)); +} + +/* + * sfe_ipv6_get_connection_match_hash() + * Generate the hash used in connection match lookups. + */ +static inline unsigned int sfe_ipv6_get_connection_match_hash(struct net_device *dev, u8 protocol, + struct sfe_ipv6_addr *src_ip, __be16 src_port, + struct sfe_ipv6_addr *dest_ip, __be16 dest_port) +{ + u32 idx, hash = 0; + size_t dev_addr = (size_t)dev; + + for (idx = 0; idx < 4; idx++) { + hash ^= src_ip->addr[idx] ^ dest_ip->addr[idx]; + } + hash = ((u32)dev_addr) ^ hash ^ protocol ^ ntohs(src_port ^ dest_port); + return ((hash >> SFE_IPV6_CONNECTION_HASH_SHIFT) ^ hash) & SFE_IPV6_CONNECTION_HASH_MASK; +} + +/* + * sfe_ipv6_find_connection_match() + * Get the IPv6 flow match info that corresponds to a particular 5-tuple. + * + * On entry we must be holding the lock that protects the hash table. + */ +static struct sfe_ipv6_connection_match * +sfe_ipv6_find_connection_match(struct sfe_ipv6 *si, struct net_device *dev, u8 protocol, + struct sfe_ipv6_addr *src_ip, __be16 src_port, + struct sfe_ipv6_addr *dest_ip, __be16 dest_port) +{ + struct sfe_ipv6_connection_match *cm; + struct sfe_ipv6_connection_match *head; + unsigned int conn_match_idx; + + conn_match_idx = sfe_ipv6_get_connection_match_hash(dev, protocol, src_ip, src_port, dest_ip, dest_port); + cm = si->conn_match_hash[conn_match_idx]; + + /* + * If we don't have anything in this chain then bail. + */ + if (unlikely(!cm)) { + return NULL; + } + + /* + * Hopefully the first entry is the one we want. + */ + if ((cm->match_src_port == src_port) + && (cm->match_dest_port == dest_port) + && (sfe_ipv6_addr_equal(cm->match_src_ip, src_ip)) + && (sfe_ipv6_addr_equal(cm->match_dest_ip, dest_ip)) + && (cm->match_protocol == protocol) + && (cm->match_dev == dev)) { + si->connection_match_hash_hits++; + return cm; + } + + /* + * Unfortunately we didn't find it at head, so we search it in chain and + * move matching entry to the top of the hash chain. We presume that this + * will be reused again very quickly. + */ + head = cm; + do { + cm = cm->next; + } while (cm && (cm->match_src_port != src_port + || cm->match_dest_port != dest_port + || !sfe_ipv6_addr_equal(cm->match_src_ip, src_ip) + || !sfe_ipv6_addr_equal(cm->match_dest_ip, dest_ip) + || cm->match_protocol != protocol + || cm->match_dev != dev)); + + /* + * Not found then we're done. + */ + if (unlikely(!cm)) { + return NULL; + } + + /* + * We found a match so move it. + */ + if (cm->next) { + cm->next->prev = cm->prev; + } + cm->prev->next = cm->next; + cm->prev = NULL; + cm->next = head; + head->prev = cm; + si->conn_match_hash[conn_match_idx] = cm; + si->connection_match_hash_reorders++; + + return cm; +} + +/* + * sfe_ipv6_connection_match_update_summary_stats() + * Update the summary stats for a connection match entry. 
+ */ +static inline void sfe_ipv6_connection_match_update_summary_stats(struct sfe_ipv6_connection_match *cm) +{ + cm->rx_packet_count64 += cm->rx_packet_count; + cm->rx_packet_count = 0; + cm->rx_byte_count64 += cm->rx_byte_count; + cm->rx_byte_count = 0; +} + +/* + * sfe_ipv6_connection_match_compute_translations() + * Compute port and address translations for a connection match entry. + */ +static void sfe_ipv6_connection_match_compute_translations(struct sfe_ipv6_connection_match *cm) +{ + u32 diff[9]; + u32 *idx_32; + u16 *idx_16; + + /* + * Before we insert the entry look to see if this is tagged as doing address + * translations. If it is then work out the adjustment that we need to apply + * to the transport checksum. + */ + if (cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_XLATE_SRC) { + u32 adj = 0; + u32 carry = 0; + + /* + * Precompute an incremental checksum adjustment so we can + * edit packets in this stream very quickly. The algorithm is from RFC1624. + */ + idx_32 = diff; + *(idx_32++) = cm->match_src_ip->addr[0]; + *(idx_32++) = cm->match_src_ip->addr[1]; + *(idx_32++) = cm->match_src_ip->addr[2]; + *(idx_32++) = cm->match_src_ip->addr[3]; + + idx_16 = (u16 *)idx_32; + *(idx_16++) = cm->match_src_port; + *(idx_16++) = ~cm->xlate_src_port; + idx_32 = (u32 *)idx_16; + + *(idx_32++) = ~cm->xlate_src_ip->addr[0]; + *(idx_32++) = ~cm->xlate_src_ip->addr[1]; + *(idx_32++) = ~cm->xlate_src_ip->addr[2]; + *(idx_32++) = ~cm->xlate_src_ip->addr[3]; + + /* + * When we compute this fold it down to a 16-bit offset + * as that way we can avoid having to do a double + * folding of the twos-complement result because the + * addition of 2 16-bit values cannot cause a double + * wrap-around! + */ + for (idx_32 = diff; idx_32 < diff + 9; idx_32++) { + u32 w = *idx_32; + adj += carry; + adj += w; + carry = (w > adj); + } + adj += carry; + adj = (adj & 0xffff) + (adj >> 16); + adj = (adj & 0xffff) + (adj >> 16); + cm->xlate_src_csum_adjustment = (u16)adj; + } + + if (cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_XLATE_DEST) { + u32 adj = 0; + u32 carry = 0; + + /* + * Precompute an incremental checksum adjustment so we can + * edit packets in this stream very quickly. The algorithm is from RFC1624. + */ + idx_32 = diff; + *(idx_32++) = cm->match_dest_ip->addr[0]; + *(idx_32++) = cm->match_dest_ip->addr[1]; + *(idx_32++) = cm->match_dest_ip->addr[2]; + *(idx_32++) = cm->match_dest_ip->addr[3]; + + idx_16 = (u16 *)idx_32; + *(idx_16++) = cm->match_dest_port; + *(idx_16++) = ~cm->xlate_dest_port; + idx_32 = (u32 *)idx_16; + + *(idx_32++) = ~cm->xlate_dest_ip->addr[0]; + *(idx_32++) = ~cm->xlate_dest_ip->addr[1]; + *(idx_32++) = ~cm->xlate_dest_ip->addr[2]; + *(idx_32++) = ~cm->xlate_dest_ip->addr[3]; + + /* + * When we compute this fold it down to a 16-bit offset + * as that way we can avoid having to do a double + * folding of the twos-complement result because the + * addition of 2 16-bit values cannot cause a double + * wrap-around! + */ + for (idx_32 = diff; idx_32 < diff + 9; idx_32++) { + u32 w = *idx_32; + adj += carry; + adj += w; + carry = (w > adj); + } + adj += carry; + adj = (adj & 0xffff) + (adj >> 16); + adj = (adj & 0xffff) + (adj >> 16); + cm->xlate_dest_csum_adjustment = (u16)adj; + } +} + +/* + * sfe_ipv6_update_summary_stats() + * Update the summary stats. 
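+ *
+ * The per-period counters and exception_events[] are folded into their
+ * 64-bit counterparts here. An illustrative snapshot of one exception
+ * counter (not part of this patch), taken under si->lock which protects
+ * the per-period values:
+ *
+ *	spin_lock_bh(&si->lock);
+ *	sfe_ipv6_update_summary_stats(si);
+ *	ttl_drops = si->exception_events64[SFE_IPV6_EXCEPTION_EVENT_UDP_SMALL_TTL];
+ *	spin_unlock_bh(&si->lock);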
+ */ +static void sfe_ipv6_update_summary_stats(struct sfe_ipv6 *si) +{ + int i; + + si->connection_create_requests64 += si->connection_create_requests; + si->connection_create_requests = 0; + si->connection_create_collisions64 += si->connection_create_collisions; + si->connection_create_collisions = 0; + si->connection_destroy_requests64 += si->connection_destroy_requests; + si->connection_destroy_requests = 0; + si->connection_destroy_misses64 += si->connection_destroy_misses; + si->connection_destroy_misses = 0; + si->connection_match_hash_hits64 += si->connection_match_hash_hits; + si->connection_match_hash_hits = 0; + si->connection_match_hash_reorders64 += si->connection_match_hash_reorders; + si->connection_match_hash_reorders = 0; + si->connection_flushes64 += si->connection_flushes; + si->connection_flushes = 0; + si->packets_forwarded64 += si->packets_forwarded; + si->packets_forwarded = 0; + si->packets_not_forwarded64 += si->packets_not_forwarded; + si->packets_not_forwarded = 0; + + for (i = 0; i < SFE_IPV6_EXCEPTION_EVENT_LAST; i++) { + si->exception_events64[i] += si->exception_events[i]; + si->exception_events[i] = 0; + } +} + +/* + * sfe_ipv6_insert_connection_match() + * Insert a connection match into the hash. + * + * On entry we must be holding the lock that protects the hash table. + */ +static inline void sfe_ipv6_insert_connection_match(struct sfe_ipv6 *si, + struct sfe_ipv6_connection_match *cm) +{ + struct sfe_ipv6_connection_match **hash_head; + struct sfe_ipv6_connection_match *prev_head; + unsigned int conn_match_idx + = sfe_ipv6_get_connection_match_hash(cm->match_dev, cm->match_protocol, + cm->match_src_ip, cm->match_src_port, + cm->match_dest_ip, cm->match_dest_port); + + hash_head = &si->conn_match_hash[conn_match_idx]; + prev_head = *hash_head; + cm->prev = NULL; + if (prev_head) { + prev_head->prev = cm; + } + + cm->next = prev_head; + *hash_head = cm; + +#ifdef CONFIG_NF_FLOW_COOKIE + if (!si->flow_cookie_enable || !(cm->flags & (SFE_IPV6_CONNECTION_MATCH_FLAG_XLATE_SRC | SFE_IPV6_CONNECTION_MATCH_FLAG_XLATE_DEST))) + return; + + /* + * Configure hardware to put a flow cookie in packet of this flow, + * then we can accelerate the lookup process when we received this packet. + */ + for (conn_match_idx = 1; conn_match_idx < SFE_FLOW_COOKIE_SIZE; conn_match_idx++) { + struct sfe_ipv6_flow_cookie_entry *entry = &si->sfe_flow_cookie_table[conn_match_idx]; + + if ((NULL == entry->match) && time_is_before_jiffies(entry->last_clean_time + HZ)) { + sfe_ipv6_flow_cookie_set_func_t func; + + rcu_read_lock(); + func = rcu_dereference(si->flow_cookie_set_func); + if (func) { + if (!func(cm->match_protocol, cm->match_src_ip->addr, cm->match_src_port, + cm->match_dest_ip->addr, cm->match_dest_port, conn_match_idx)) { + entry->match = cm; + cm->flow_cookie = conn_match_idx; + } else { + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_FLOW_COOKIE_ADD_FAIL]++; + } + } + rcu_read_unlock(); + + break; + } + } +#endif +} + +/* + * sfe_ipv6_remove_connection_match() + * Remove a connection match object from the hash. + * + * On entry we must be holding the lock that protects the hash table. 
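+ *
+ * An illustrative caller pattern (not part of this patch): both directions
+ * of a connection are unlinked under the lock and the flush is performed
+ * after the lock has been dropped:
+ *
+ *	spin_lock_bh(&si->lock);
+ *	sfe_ipv6_remove_connection(si, c);
+ *	spin_unlock_bh(&si->lock);
+ *	sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH);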
+ */ +static inline void sfe_ipv6_remove_connection_match(struct sfe_ipv6 *si, struct sfe_ipv6_connection_match *cm) +{ +#ifdef CONFIG_NF_FLOW_COOKIE + if (si->flow_cookie_enable) { + /* + * Tell hardware that we no longer need a flow cookie in packet of this flow + */ + unsigned int conn_match_idx; + + for (conn_match_idx = 1; conn_match_idx < SFE_FLOW_COOKIE_SIZE; conn_match_idx++) { + struct sfe_ipv6_flow_cookie_entry *entry = &si->sfe_flow_cookie_table[conn_match_idx]; + + if (cm == entry->match) { + sfe_ipv6_flow_cookie_set_func_t func; + + rcu_read_lock(); + func = rcu_dereference(si->flow_cookie_set_func); + if (func) { + func(cm->match_protocol, cm->match_src_ip->addr, cm->match_src_port, + cm->match_dest_ip->addr, cm->match_dest_port, 0); + } + rcu_read_unlock(); + + cm->flow_cookie = 0; + entry->match = NULL; + entry->last_clean_time = jiffies; + break; + } + } + } +#endif + + /* + * Unlink the connection match entry from the hash. + */ + if (cm->prev) { + cm->prev->next = cm->next; + } else { + unsigned int conn_match_idx + = sfe_ipv6_get_connection_match_hash(cm->match_dev, cm->match_protocol, + cm->match_src_ip, cm->match_src_port, + cm->match_dest_ip, cm->match_dest_port); + si->conn_match_hash[conn_match_idx] = cm->next; + } + + if (cm->next) { + cm->next->prev = cm->prev; + } + + /* + * If the connection match entry is in the active list remove it. + */ + if (cm->active) { + if (likely(cm->active_prev)) { + cm->active_prev->active_next = cm->active_next; + } else { + si->active_head = cm->active_next; + } + + if (likely(cm->active_next)) { + cm->active_next->active_prev = cm->active_prev; + } else { + si->active_tail = cm->active_prev; + } + } +} + +/* + * sfe_ipv6_get_connection_hash() + * Generate the hash used in connection lookups. + */ +static inline unsigned int sfe_ipv6_get_connection_hash(u8 protocol, struct sfe_ipv6_addr *src_ip, __be16 src_port, + struct sfe_ipv6_addr *dest_ip, __be16 dest_port) +{ + u32 idx, hash = 0; + + for (idx = 0; idx < 4; idx++) { + hash ^= src_ip->addr[idx] ^ dest_ip->addr[idx]; + } + hash = hash ^ protocol ^ ntohs(src_port ^ dest_port); + return ((hash >> SFE_IPV6_CONNECTION_HASH_SHIFT) ^ hash) & SFE_IPV6_CONNECTION_HASH_MASK; +} + +/* + * sfe_ipv6_find_connection() + * Get the IPv6 connection info that corresponds to a particular 5-tuple. + * + * On entry we must be holding the lock that protects the hash table. + */ +static inline struct sfe_ipv6_connection *sfe_ipv6_find_connection(struct sfe_ipv6 *si, u32 protocol, + struct sfe_ipv6_addr *src_ip, __be16 src_port, + struct sfe_ipv6_addr *dest_ip, __be16 dest_port) +{ + struct sfe_ipv6_connection *c; + unsigned int conn_idx = sfe_ipv6_get_connection_hash(protocol, src_ip, src_port, dest_ip, dest_port); + c = si->conn_hash[conn_idx]; + + /* + * If we don't have anything in this chain then bale. + */ + if (unlikely(!c)) { + return NULL; + } + + /* + * Hopefully the first entry is the one we want. + */ + if ((c->src_port == src_port) + && (c->dest_port == dest_port) + && (sfe_ipv6_addr_equal(c->src_ip, src_ip)) + && (sfe_ipv6_addr_equal(c->dest_ip, dest_ip)) + && (c->protocol == protocol)) { + return c; + } + + /* + * Unfortunately we didn't find it at head, so we search it in chain. 
+ */ + do { + c = c->next; + } while (c && (c->src_port != src_port + || c->dest_port != dest_port + || !sfe_ipv6_addr_equal(c->src_ip, src_ip) + || !sfe_ipv6_addr_equal(c->dest_ip, dest_ip) + || c->protocol != protocol)); + + /* + * Will need connection entry for next create/destroy metadata, + * So no need to re-order entry for these requests + */ + return c; +} + +/* + * sfe_ipv6_mark_rule() + * Updates the mark for a current offloaded connection + * + * Will take hash lock upon entry + */ +void sfe_ipv6_mark_rule(struct sfe_connection_mark *mark) +{ + struct sfe_ipv6 *si = &__si6; + struct sfe_ipv6_connection *c; + + spin_lock_bh(&si->lock); + c = sfe_ipv6_find_connection(si, mark->protocol, + mark->src_ip.ip6, mark->src_port, + mark->dest_ip.ip6, mark->dest_port); + if (c) { + WARN_ON((0 != c->mark) && (0 == mark->mark)); + c->mark = mark->mark; + } + spin_unlock_bh(&si->lock); + + if (c) { + DEBUG_TRACE("Matching connection found for mark, " + "setting from %08x to %08x\n", + c->mark, mark->mark); + } +} + +/* + * sfe_ipv6_insert_connection() + * Insert a connection into the hash. + * + * On entry we must be holding the lock that protects the hash table. + */ +static void sfe_ipv6_insert_connection(struct sfe_ipv6 *si, struct sfe_ipv6_connection *c) +{ + struct sfe_ipv6_connection **hash_head; + struct sfe_ipv6_connection *prev_head; + unsigned int conn_idx; + + /* + * Insert entry into the connection hash. + */ + conn_idx = sfe_ipv6_get_connection_hash(c->protocol, c->src_ip, c->src_port, + c->dest_ip, c->dest_port); + hash_head = &si->conn_hash[conn_idx]; + prev_head = *hash_head; + c->prev = NULL; + if (prev_head) { + prev_head->prev = c; + } + + c->next = prev_head; + *hash_head = c; + + /* + * Insert entry into the "all connections" list. + */ + if (si->all_connections_tail) { + c->all_connections_prev = si->all_connections_tail; + si->all_connections_tail->all_connections_next = c; + } else { + c->all_connections_prev = NULL; + si->all_connections_head = c; + } + + si->all_connections_tail = c; + c->all_connections_next = NULL; + si->num_connections++; + + /* + * Insert the connection match objects too. + */ + sfe_ipv6_insert_connection_match(si, c->original_match); + sfe_ipv6_insert_connection_match(si, c->reply_match); +} + +/* + * sfe_ipv6_remove_connection() + * Remove a sfe_ipv6_connection object from the hash. + * + * On entry we must be holding the lock that protects the hash table. + */ +static void sfe_ipv6_remove_connection(struct sfe_ipv6 *si, struct sfe_ipv6_connection *c) +{ + /* + * Remove the connection match objects. + */ + sfe_ipv6_remove_connection_match(si, c->reply_match); + sfe_ipv6_remove_connection_match(si, c->original_match); + + /* + * Unlink the connection. + */ + if (c->prev) { + c->prev->next = c->next; + } else { + unsigned int conn_idx = sfe_ipv6_get_connection_hash(c->protocol, c->src_ip, c->src_port, + c->dest_ip, c->dest_port); + si->conn_hash[conn_idx] = c->next; + } + + if (c->next) { + c->next->prev = c->prev; + } + + /* + * Unlink connection from all_connections list + */ + if (c->all_connections_prev) { + c->all_connections_prev->all_connections_next = c->all_connections_next; + } else { + si->all_connections_head = c->all_connections_next; + } + + if (c->all_connections_next) { + c->all_connections_next->all_connections_prev = c->all_connections_prev; + } else { + si->all_connections_tail = c->all_connections_prev; + } + + si->num_connections--; +} + +/* + * sfe_ipv6_gen_sync_connection() + * Sync a connection. 
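+ *
+ * The filled-in sfe_connection_sync is consumed by the callback registered
+ * through sfe_ipv6_register_sync_rule_callback(). A minimal consumer sketch
+ * (illustrative only; update_flow_accounting() is a hypothetical helper):
+ *
+ *	static void my_sync_cb(struct sfe_connection_sync *sis)
+ *	{
+ *		update_flow_accounting(sis->src_new_packet_count,
+ *				       sis->src_new_byte_count,
+ *				       sis->dest_new_packet_count,
+ *				       sis->dest_new_byte_count);
+ *	}
+ *
+ *	sfe_ipv6_register_sync_rule_callback(my_sync_cb);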
+ * + * On entry to this function we expect that the lock for the connection is either + * already held or isn't required. + */ +static void sfe_ipv6_gen_sync_connection(struct sfe_ipv6 *si, struct sfe_ipv6_connection *c, + struct sfe_connection_sync *sis, sfe_sync_reason_t reason, + u64 now_jiffies) +{ + struct sfe_ipv6_connection_match *original_cm; + struct sfe_ipv6_connection_match *reply_cm; + + /* + * Fill in the update message. + */ + sis->is_v6 = 1; + sis->protocol = c->protocol; + sis->src_ip.ip6[0] = c->src_ip[0]; + sis->src_ip_xlate.ip6[0] = c->src_ip_xlate[0]; + sis->dest_ip.ip6[0] = c->dest_ip[0]; + sis->dest_ip_xlate.ip6[0] = c->dest_ip_xlate[0]; + sis->src_port = c->src_port; + sis->src_port_xlate = c->src_port_xlate; + sis->dest_port = c->dest_port; + sis->dest_port_xlate = c->dest_port_xlate; + + original_cm = c->original_match; + reply_cm = c->reply_match; + sis->src_td_max_window = original_cm->protocol_state.tcp.max_win; + sis->src_td_end = original_cm->protocol_state.tcp.end; + sis->src_td_max_end = original_cm->protocol_state.tcp.max_end; + sis->dest_td_max_window = reply_cm->protocol_state.tcp.max_win; + sis->dest_td_end = reply_cm->protocol_state.tcp.end; + sis->dest_td_max_end = reply_cm->protocol_state.tcp.max_end; + + sis->src_new_packet_count = original_cm->rx_packet_count; + sis->src_new_byte_count = original_cm->rx_byte_count; + sis->dest_new_packet_count = reply_cm->rx_packet_count; + sis->dest_new_byte_count = reply_cm->rx_byte_count; + + sfe_ipv6_connection_match_update_summary_stats(original_cm); + sfe_ipv6_connection_match_update_summary_stats(reply_cm); + + sis->src_dev = original_cm->match_dev; + sis->src_packet_count = original_cm->rx_packet_count64; + sis->src_byte_count = original_cm->rx_byte_count64; + + sis->dest_dev = reply_cm->match_dev; + sis->dest_packet_count = reply_cm->rx_packet_count64; + sis->dest_byte_count = reply_cm->rx_byte_count64; + + sis->reason = reason; + + /* + * Get the time increment since our last sync. + */ + sis->delta_jiffies = now_jiffies - c->last_sync_jiffies; + c->last_sync_jiffies = now_jiffies; +} + +/* + * sfe_ipv6_flush_connection() + * Flush a connection and free all associated resources. + * + * We need to be called with bottom halves disabled locally as we need to acquire + * the connection hash lock and release it again. In general we're actually called + * from within a BH and so we're fine, but we're also called when connections are + * torn down. + */ +static void sfe_ipv6_flush_connection(struct sfe_ipv6 *si, + struct sfe_ipv6_connection *c, + sfe_sync_reason_t reason) +{ + struct sfe_connection_sync sis; + u64 now_jiffies; + sfe_sync_rule_callback_t sync_rule_callback; + + rcu_read_lock(); + spin_lock_bh(&si->lock); + si->connection_flushes++; + sync_rule_callback = rcu_dereference(si->sync_rule_callback); + spin_unlock_bh(&si->lock); + + if (sync_rule_callback) { + /* + * Generate a sync message and then sync. + */ + now_jiffies = get_jiffies_64(); + sfe_ipv6_gen_sync_connection(si, c, &sis, reason, now_jiffies); + sync_rule_callback(&sis); + } + + rcu_read_unlock(); + + /* + * Release our hold of the source and dest devices and free the memory + * for our connection objects. + */ + dev_put(c->original_dev); + dev_put(c->reply_dev); + kfree(c->original_match); + kfree(c->reply_match); + kfree(c); +} + +/* + * sfe_ipv6_recv_udp() + * Handle UDP packet receives and forwarding. 
+ */ +static int sfe_ipv6_recv_udp(struct sfe_ipv6 *si, struct sk_buff *skb, struct net_device *dev, + unsigned int len, struct sfe_ipv6_ip_hdr *iph, unsigned int ihl, bool flush_on_find) +{ + struct sfe_ipv6_udp_hdr *udph; + struct sfe_ipv6_addr *src_ip; + struct sfe_ipv6_addr *dest_ip; + __be16 src_port; + __be16 dest_port; + struct sfe_ipv6_connection_match *cm; + struct net_device *xmit_dev; + + /* + * Is our packet too short to contain a valid UDP header? + */ + if (!pskb_may_pull(skb, (sizeof(struct sfe_ipv6_udp_hdr) + ihl))) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_UDP_HEADER_INCOMPLETE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("packet too short for UDP header\n"); + return 0; + } + + /* + * Read the IP address and port information. Read the IP header data first + * because we've almost certainly got that in the cache. We may not yet have + * the UDP header cached though so allow more time for any prefetching. + */ + src_ip = &iph->saddr; + dest_ip = &iph->daddr; + + udph = (struct sfe_ipv6_udp_hdr *)(skb->data + ihl); + src_port = udph->source; + dest_port = udph->dest; + + spin_lock_bh(&si->lock); + + /* + * Look for a connection match. + */ +#ifdef CONFIG_NF_FLOW_COOKIE + cm = si->sfe_flow_cookie_table[skb->flow_cookie & SFE_FLOW_COOKIE_MASK].match; + if (unlikely(!cm)) { + cm = sfe_ipv6_find_connection_match(si, dev, IPPROTO_UDP, src_ip, src_port, dest_ip, dest_port); + } +#else + cm = sfe_ipv6_find_connection_match(si, dev, IPPROTO_UDP, src_ip, src_port, dest_ip, dest_port); +#endif + if (unlikely(!cm)) { + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_UDP_NO_CONNECTION]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("no connection found\n"); + return 0; + } + + /* + * If our packet has beern marked as "flush on find" we can't actually + * forward it in the fast path, but now that we've found an associated + * connection we can flush that out before we process the packet. + */ + if (unlikely(flush_on_find)) { + struct sfe_ipv6_connection *c = cm->connection; + sfe_ipv6_remove_connection(si, c); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_UDP_IP_OPTIONS_OR_INITIAL_FRAGMENT]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("flush on find\n"); + sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + +#ifdef CONFIG_XFRM + /* + * We can't accelerate the flow on this direction, just let it go + * through the slow path. + */ + if (unlikely(!cm->flow_accel)) { + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + return 0; + } +#endif + + /* + * Does our hop_limit allow forwarding? + */ + if (unlikely(iph->hop_limit < 2)) { + struct sfe_ipv6_connection *c = cm->connection; + sfe_ipv6_remove_connection(si, c); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_UDP_SMALL_TTL]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("hop_limit too low\n"); + sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + /* + * If our packet is larger than the MTU of the transmit interface then + * we can't forward it easily. 
+ */ + if (unlikely(len > cm->xmit_dev_mtu)) { + struct sfe_ipv6_connection *c = cm->connection; + sfe_ipv6_remove_connection(si, c); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_UDP_NEEDS_FRAGMENTATION]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("larger than mtu\n"); + sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + /* + * From this point on we're good to modify the packet. + */ + + /* + * Check if skb was cloned. If it was, unshare it. Because + * the data area is going to be written in this path and we don't want to + * change the cloned skb's data section. + */ + if (unlikely(skb_cloned(skb))) { + DEBUG_TRACE("%px: skb is a cloned skb\n", skb); + skb = skb_unshare(skb, GFP_ATOMIC); + if (!skb) { + DEBUG_WARN("Failed to unshare the cloned skb\n"); + return 0; + } + + /* + * Update the iph and udph pointers with the unshared skb's data area. + */ + iph = (struct sfe_ipv6_ip_hdr *)skb->data; + udph = (struct sfe_ipv6_udp_hdr *)(skb->data + ihl); + } + + /* + * Update DSCP + */ + if (unlikely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_DSCP_REMARK)) { + sfe_ipv6_change_dsfield(iph, cm->dscp); + } + + /* + * Decrement our hop_limit. + */ + iph->hop_limit -= 1; + + /* + * Do we have to perform translations of the source address/port? + */ + if (unlikely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_XLATE_SRC)) { + u16 udp_csum; + + iph->saddr = cm->xlate_src_ip[0]; + udph->source = cm->xlate_src_port; + + /* + * Do we have a non-zero UDP checksum? If we do then we need + * to update it. + */ + udp_csum = udph->check; + if (likely(udp_csum)) { + u32 sum = udp_csum + cm->xlate_src_csum_adjustment; + sum = (sum & 0xffff) + (sum >> 16); + udph->check = (u16)sum; + } + } + + /* + * Do we have to perform translations of the destination address/port? + */ + if (unlikely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_XLATE_DEST)) { + u16 udp_csum; + + iph->daddr = cm->xlate_dest_ip[0]; + udph->dest = cm->xlate_dest_port; + + /* + * Do we have a non-zero UDP checksum? If we do then we need + * to update it. + */ + udp_csum = udph->check; + if (likely(udp_csum)) { + u32 sum = udp_csum + cm->xlate_dest_csum_adjustment; + sum = (sum & 0xffff) + (sum >> 16); + udph->check = (u16)sum; + } + } + + /* + * Update traffic stats. + */ + cm->rx_packet_count++; + cm->rx_byte_count += len; + + /* + * If we're not already on the active list then insert ourselves at the tail + * of the current list. + */ + if (unlikely(!cm->active)) { + cm->active = true; + cm->active_prev = si->active_tail; + if (likely(si->active_tail)) { + si->active_tail->active_next = cm; + } else { + si->active_head = cm; + } + si->active_tail = cm; + } + + xmit_dev = cm->xmit_dev; + skb->dev = xmit_dev; + + /* + * Check to see if we need to write a header. + */ + if (likely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_WRITE_L2_HDR)) { + if (unlikely(!(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_WRITE_FAST_ETH_HDR))) { + dev_hard_header(skb, xmit_dev, ETH_P_IPV6, + cm->xmit_dest_mac, cm->xmit_src_mac, len); + } else { + /* + * For the simple case we write this really fast. + */ + struct sfe_ipv6_eth_hdr *eth = (struct sfe_ipv6_eth_hdr *)__skb_push(skb, ETH_HLEN); + eth->h_proto = htons(ETH_P_IPV6); + eth->h_dest[0] = cm->xmit_dest_mac[0]; + eth->h_dest[1] = cm->xmit_dest_mac[1]; + eth->h_dest[2] = cm->xmit_dest_mac[2]; + eth->h_source[0] = cm->xmit_src_mac[0]; + eth->h_source[1] = cm->xmit_src_mac[1]; + eth->h_source[2] = cm->xmit_src_mac[2]; + } + } + + /* + * Update priority of skb. 
+ */ + if (unlikely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_PRIORITY_REMARK)) { + skb->priority = cm->priority; + } + + /* + * Mark outgoing packet. + */ + skb->mark = cm->connection->mark; + if (skb->mark) { + DEBUG_TRACE("SKB MARK is NON ZERO %x\n", skb->mark); + } + + si->packets_forwarded++; + spin_unlock_bh(&si->lock); + + /* + * We're going to check for GSO flags when we transmit the packet so + * start fetching the necessary cache line now. + */ + prefetch(skb_shinfo(skb)); + + /* + * Mark that this packet has been fast forwarded. + */ + skb->fast_forwarded = 1; + + /* + * Send the packet on its way. + */ + dev_queue_xmit(skb); + + return 1; +} + +/* + * sfe_ipv6_process_tcp_option_sack() + * Parse TCP SACK option and update ack according + */ +static bool sfe_ipv6_process_tcp_option_sack(const struct sfe_ipv6_tcp_hdr *th, const u32 data_offs, + u32 *ack) +{ + u32 length = sizeof(struct sfe_ipv6_tcp_hdr); + u8 *ptr = (u8 *)th + length; + + /* + * Ignore processing if TCP packet has only TIMESTAMP option. + */ + if (likely(data_offs == length + TCPOLEN_TIMESTAMP + 1 + 1) + && likely(ptr[0] == TCPOPT_NOP) + && likely(ptr[1] == TCPOPT_NOP) + && likely(ptr[2] == TCPOPT_TIMESTAMP) + && likely(ptr[3] == TCPOLEN_TIMESTAMP)) { + return true; + } + + /* + * TCP options. Parse SACK option. + */ + while (length < data_offs) { + u8 size; + u8 kind; + + ptr = (u8 *)th + length; + kind = *ptr; + + /* + * NOP, for padding + * Not in the switch because to fast escape and to not calculate size + */ + if (kind == TCPOPT_NOP) { + length++; + continue; + } + + if (kind == TCPOPT_SACK) { + u32 sack = 0; + u8 re = 1 + 1; + + size = *(ptr + 1); + if ((size < (1 + 1 + TCPOLEN_SACK_PERBLOCK)) + || ((size - (1 + 1)) % (TCPOLEN_SACK_PERBLOCK)) + || (size > (data_offs - length))) { + return false; + } + + re += 4; + while (re < size) { + u32 sack_re; + u8 *sptr = ptr + re; + sack_re = (sptr[0] << 24) | (sptr[1] << 16) | (sptr[2] << 8) | sptr[3]; + if (sack_re > sack) { + sack = sack_re; + } + re += TCPOLEN_SACK_PERBLOCK; + } + if (sack > *ack) { + *ack = sack; + } + length += size; + continue; + } + if (kind == TCPOPT_EOL) { + return true; + } + size = *(ptr + 1); + if (size < 2) { + return false; + } + length += size; + } + + return true; +} + +/* + * sfe_ipv6_recv_tcp() + * Handle TCP packet receives and forwarding. + */ +static int sfe_ipv6_recv_tcp(struct sfe_ipv6 *si, struct sk_buff *skb, struct net_device *dev, + unsigned int len, struct sfe_ipv6_ip_hdr *iph, unsigned int ihl, bool flush_on_find) +{ + struct sfe_ipv6_tcp_hdr *tcph; + struct sfe_ipv6_addr *src_ip; + struct sfe_ipv6_addr *dest_ip; + __be16 src_port; + __be16 dest_port; + struct sfe_ipv6_connection_match *cm; + struct sfe_ipv6_connection_match *counter_cm; + u32 flags; + struct net_device *xmit_dev; + + /* + * Is our packet too short to contain a valid UDP header? + */ + if (!pskb_may_pull(skb, (sizeof(struct sfe_ipv6_tcp_hdr) + ihl))) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_TCP_HEADER_INCOMPLETE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("packet too short for TCP header\n"); + return 0; + } + + /* + * Read the IP address and port information. Read the IP header data first + * because we've almost certainly got that in the cache. We may not yet have + * the TCP header cached though so allow more time for any prefetching. 
+ */ + src_ip = &iph->saddr; + dest_ip = &iph->daddr; + + tcph = (struct sfe_ipv6_tcp_hdr *)(skb->data + ihl); + src_port = tcph->source; + dest_port = tcph->dest; + flags = tcp_flag_word(tcph); + + spin_lock_bh(&si->lock); + + /* + * Look for a connection match. + */ +#ifdef CONFIG_NF_FLOW_COOKIE + cm = si->sfe_flow_cookie_table[skb->flow_cookie & SFE_FLOW_COOKIE_MASK].match; + if (unlikely(!cm)) { + cm = sfe_ipv6_find_connection_match(si, dev, IPPROTO_TCP, src_ip, src_port, dest_ip, dest_port); + } +#else + cm = sfe_ipv6_find_connection_match(si, dev, IPPROTO_TCP, src_ip, src_port, dest_ip, dest_port); +#endif + if (unlikely(!cm)) { + /* + * We didn't get a connection but as TCP is connection-oriented that + * may be because this is a non-fast connection (not running established). + * For diagnostic purposes we differentiate this here. + */ + if (likely((flags & (TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_FIN | TCP_FLAG_ACK)) == TCP_FLAG_ACK)) { + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_TCP_NO_CONNECTION_FAST_FLAGS]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("no connection found - fast flags\n"); + return 0; + } + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_TCP_NO_CONNECTION_SLOW_FLAGS]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("no connection found - slow flags: 0x%x\n", + flags & (TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_FIN | TCP_FLAG_ACK)); + return 0; + } + + /* + * If our packet has beern marked as "flush on find" we can't actually + * forward it in the fast path, but now that we've found an associated + * connection we can flush that out before we process the packet. + */ + if (unlikely(flush_on_find)) { + struct sfe_ipv6_connection *c = cm->connection; + sfe_ipv6_remove_connection(si, c); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_TCP_IP_OPTIONS_OR_INITIAL_FRAGMENT]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("flush on find\n"); + sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + +#ifdef CONFIG_XFRM + /* + * We can't accelerate the flow on this direction, just let it go + * through the slow path. + */ + if (unlikely(!cm->flow_accel)) { + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + return 0; + } +#endif + + /* + * Does our hop_limit allow forwarding? + */ + if (unlikely(iph->hop_limit < 2)) { + struct sfe_ipv6_connection *c = cm->connection; + sfe_ipv6_remove_connection(si, c); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_TCP_SMALL_TTL]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("hop_limit too low\n"); + sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + /* + * If our packet is larger than the MTU of the transmit interface then + * we can't forward it easily. + */ + if (unlikely((len > cm->xmit_dev_mtu) && !skb_is_gso(skb))) { + struct sfe_ipv6_connection *c = cm->connection; + sfe_ipv6_remove_connection(si, c); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_TCP_NEEDS_FRAGMENTATION]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("larger than mtu\n"); + sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + /* + * Look at our TCP flags. Anything missing an ACK or that has RST, SYN or FIN + * set is not a fast path packet. 
+ */ + if (unlikely((flags & (TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_FIN | TCP_FLAG_ACK)) != TCP_FLAG_ACK)) { + struct sfe_ipv6_connection *c = cm->connection; + sfe_ipv6_remove_connection(si, c); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_TCP_FLAGS]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("TCP flags: 0x%x are not fast\n", + flags & (TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_FIN | TCP_FLAG_ACK)); + sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + counter_cm = cm->counter_match; + + /* + * Are we doing sequence number checking? + */ + if (likely(!(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK))) { + u32 seq; + u32 ack; + u32 sack; + u32 data_offs; + u32 end; + u32 left_edge; + u32 scaled_win; + u32 max_end; + + /* + * Is our sequence fully past the right hand edge of the window? + */ + seq = ntohl(tcph->seq); + if (unlikely((s32)(seq - (cm->protocol_state.tcp.max_end + 1)) > 0)) { + struct sfe_ipv6_connection *c = cm->connection; + sfe_ipv6_remove_connection(si, c); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_TCP_SEQ_EXCEEDS_RIGHT_EDGE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("seq: %u exceeds right edge: %u\n", + seq, cm->protocol_state.tcp.max_end + 1); + sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + /* + * Check that our TCP data offset isn't too short. + */ + data_offs = tcph->doff << 2; + if (unlikely(data_offs < sizeof(struct sfe_ipv6_tcp_hdr))) { + struct sfe_ipv6_connection *c = cm->connection; + sfe_ipv6_remove_connection(si, c); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_TCP_SMALL_DATA_OFFS]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("TCP data offset: %u, too small\n", data_offs); + sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + /* + * Update ACK according to any SACK option. + */ + ack = ntohl(tcph->ack_seq); + sack = ack; + if (unlikely(!sfe_ipv6_process_tcp_option_sack(tcph, data_offs, &sack))) { + struct sfe_ipv6_connection *c = cm->connection; + sfe_ipv6_remove_connection(si, c); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_TCP_BAD_SACK]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("TCP option SACK size is wrong\n"); + sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + /* + * Check that our TCP data offset isn't past the end of the packet. + */ + data_offs += sizeof(struct sfe_ipv6_ip_hdr); + if (unlikely(len < data_offs)) { + struct sfe_ipv6_connection *c = cm->connection; + sfe_ipv6_remove_connection(si, c); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_TCP_BIG_DATA_OFFS]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("TCP data offset: %u, past end of packet: %u\n", + data_offs, len); + sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + end = seq + len - data_offs; + + /* + * Is our sequence fully before the left hand edge of the window? 
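+	 * For example (illustrative numbers): if our recorded end is 1000 and
+	 * the counter direction's max_win is 200, the left edge is
+	 * 1000 - 200 - 1 = 799, so a segment whose data ends below 799 is
+	 * treated as stale and handed back to the slow path.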
+ */ + if (unlikely((s32)(end - (cm->protocol_state.tcp.end + - counter_cm->protocol_state.tcp.max_win - 1)) < 0)) { + struct sfe_ipv6_connection *c = cm->connection; + sfe_ipv6_remove_connection(si, c); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_TCP_SEQ_BEFORE_LEFT_EDGE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("seq: %u before left edge: %u\n", + end, cm->protocol_state.tcp.end - counter_cm->protocol_state.tcp.max_win - 1); + sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + /* + * Are we acking data that is to the right of what has been sent? + */ + if (unlikely((s32)(sack - (counter_cm->protocol_state.tcp.end + 1)) > 0)) { + struct sfe_ipv6_connection *c = cm->connection; + sfe_ipv6_remove_connection(si, c); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_TCP_ACK_EXCEEDS_RIGHT_EDGE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("ack: %u exceeds right edge: %u\n", + sack, counter_cm->protocol_state.tcp.end + 1); + sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + /* + * Is our ack too far before the left hand edge of the window? + */ + left_edge = counter_cm->protocol_state.tcp.end + - cm->protocol_state.tcp.max_win + - SFE_IPV6_TCP_MAX_ACK_WINDOW + - 1; + if (unlikely((s32)(sack - left_edge) < 0)) { + struct sfe_ipv6_connection *c = cm->connection; + sfe_ipv6_remove_connection(si, c); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_TCP_ACK_BEFORE_LEFT_EDGE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("ack: %u before left edge: %u\n", sack, left_edge); + sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + /* + * Have we just seen the largest window size yet for this connection? If yes + * then we need to record the new value. + */ + scaled_win = ntohs(tcph->window) << cm->protocol_state.tcp.win_scale; + scaled_win += (sack - ack); + if (unlikely(cm->protocol_state.tcp.max_win < scaled_win)) { + cm->protocol_state.tcp.max_win = scaled_win; + } + + /* + * If our sequence and/or ack numbers have advanced then record the new state. + */ + if (likely((s32)(end - cm->protocol_state.tcp.end) >= 0)) { + cm->protocol_state.tcp.end = end; + } + + max_end = sack + scaled_win; + if (likely((s32)(max_end - counter_cm->protocol_state.tcp.max_end) >= 0)) { + counter_cm->protocol_state.tcp.max_end = max_end; + } + } + + /* + * From this point on we're good to modify the packet. + */ + + /* + * Check if skb was cloned. If it was, unshare it. Because + * the data area is going to be written in this path and we don't want to + * change the cloned skb's data section. + */ + if (unlikely(skb_cloned(skb))) { + DEBUG_TRACE("%px: skb is a cloned skb\n", skb); + skb = skb_unshare(skb, GFP_ATOMIC); + if (!skb) { + DEBUG_WARN("Failed to unshare the cloned skb\n"); + return 0; + } + + /* + * Update the iph and tcph pointers with the unshared skb's data area. + */ + iph = (struct sfe_ipv6_ip_hdr *)skb->data; + tcph = (struct sfe_ipv6_tcp_hdr *)(skb->data + ihl); + } + + /* + * Update DSCP + */ + if (unlikely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_DSCP_REMARK)) { + sfe_ipv6_change_dsfield(iph, cm->dscp); + } + + /* + * Decrement our hop_limit. + */ + iph->hop_limit -= 1; + + /* + * Do we have to perform translations of the source address/port? 
+ */ + if (unlikely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_XLATE_SRC)) { + u16 tcp_csum; + u32 sum; + + iph->saddr = cm->xlate_src_ip[0]; + tcph->source = cm->xlate_src_port; + + /* + * Do we have a non-zero UDP checksum? If we do then we need + * to update it. + */ + tcp_csum = tcph->check; + sum = tcp_csum + cm->xlate_src_csum_adjustment; + sum = (sum & 0xffff) + (sum >> 16); + tcph->check = (u16)sum; + } + + /* + * Do we have to perform translations of the destination address/port? + */ + if (unlikely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_XLATE_DEST)) { + u16 tcp_csum; + u32 sum; + + iph->daddr = cm->xlate_dest_ip[0]; + tcph->dest = cm->xlate_dest_port; + + /* + * Do we have a non-zero UDP checksum? If we do then we need + * to update it. + */ + tcp_csum = tcph->check; + sum = tcp_csum + cm->xlate_dest_csum_adjustment; + sum = (sum & 0xffff) + (sum >> 16); + tcph->check = (u16)sum; + } + + /* + * Update traffic stats. + */ + cm->rx_packet_count++; + cm->rx_byte_count += len; + + /* + * If we're not already on the active list then insert ourselves at the tail + * of the current list. + */ + if (unlikely(!cm->active)) { + cm->active = true; + cm->active_prev = si->active_tail; + if (likely(si->active_tail)) { + si->active_tail->active_next = cm; + } else { + si->active_head = cm; + } + si->active_tail = cm; + } + + xmit_dev = cm->xmit_dev; + skb->dev = xmit_dev; + + /* + * Check to see if we need to write a header. + */ + if (likely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_WRITE_L2_HDR)) { + if (unlikely(!(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_WRITE_FAST_ETH_HDR))) { + dev_hard_header(skb, xmit_dev, ETH_P_IPV6, + cm->xmit_dest_mac, cm->xmit_src_mac, len); + } else { + /* + * For the simple case we write this really fast. + */ + struct sfe_ipv6_eth_hdr *eth = (struct sfe_ipv6_eth_hdr *)__skb_push(skb, ETH_HLEN); + eth->h_proto = htons(ETH_P_IPV6); + eth->h_dest[0] = cm->xmit_dest_mac[0]; + eth->h_dest[1] = cm->xmit_dest_mac[1]; + eth->h_dest[2] = cm->xmit_dest_mac[2]; + eth->h_source[0] = cm->xmit_src_mac[0]; + eth->h_source[1] = cm->xmit_src_mac[1]; + eth->h_source[2] = cm->xmit_src_mac[2]; + } + } + + /* + * Update priority of skb. + */ + if (unlikely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_PRIORITY_REMARK)) { + skb->priority = cm->priority; + } + + /* + * Mark outgoing packet + */ + skb->mark = cm->connection->mark; + if (skb->mark) { + DEBUG_TRACE("SKB MARK is NON ZERO %x\n", skb->mark); + } + + si->packets_forwarded++; + spin_unlock_bh(&si->lock); + + /* + * We're going to check for GSO flags when we transmit the packet so + * start fetching the necessary cache line now. + */ + prefetch(skb_shinfo(skb)); + + /* + * Mark that this packet has been fast forwarded. + */ + skb->fast_forwarded = 1; + + /* + * Send the packet on its way. + */ + dev_queue_xmit(skb); + + return 1; +} + +/* + * sfe_ipv6_recv_icmp() + * Handle ICMP packet receives. + * + * ICMP packets aren't handled as a "fast path" and always have us process them + * through the default Linux stack. What we do need to do is look for any errors + * about connections we are handling in the fast path. If we find any such + * connections then we want to flush their state so that the ICMP error path + * within Linux has all of the correct state should it need it. 
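+ *
+ * For example (illustrative only): if we forwarded a flow out of a device and
+ * a downstream router returns "time exceeded", the error arrives back on that
+ * same device with a copy of the offending packet embedded in it. The lookup
+ * below therefore uses the receiving device together with the embedded
+ * source and destination swapped, which finds the counter-direction match
+ * entry and lets the whole connection be flushed.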
+ */ +static int sfe_ipv6_recv_icmp(struct sfe_ipv6 *si, struct sk_buff *skb, struct net_device *dev, + unsigned int len, struct sfe_ipv6_ip_hdr *iph, unsigned int ihl) +{ + struct icmp6hdr *icmph; + struct sfe_ipv6_ip_hdr *icmp_iph; + struct sfe_ipv6_udp_hdr *icmp_udph; + struct sfe_ipv6_tcp_hdr *icmp_tcph; + struct sfe_ipv6_addr *src_ip; + struct sfe_ipv6_addr *dest_ip; + __be16 src_port; + __be16 dest_port; + struct sfe_ipv6_connection_match *cm; + struct sfe_ipv6_connection *c; + u8 next_hdr; + + /* + * Is our packet too short to contain a valid ICMP header? + */ + len -= ihl; + if (!pskb_may_pull(skb, ihl + sizeof(struct icmp6hdr))) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_ICMP_HEADER_INCOMPLETE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("packet too short for ICMP header\n"); + return 0; + } + + /* + * We only handle "destination unreachable" and "time exceeded" messages. + */ + icmph = (struct icmp6hdr *)(skb->data + ihl); + if ((icmph->icmp6_type != ICMPV6_DEST_UNREACH) + && (icmph->icmp6_type != ICMPV6_TIME_EXCEED)) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_ICMP_UNHANDLED_TYPE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("unhandled ICMP type: 0x%x\n", icmph->icmp6_type); + return 0; + } + + /* + * Do we have the full embedded IP header? + * We should have 8 bytes of next L4 header - that's enough to identify + * the connection. + */ + len -= sizeof(struct icmp6hdr); + ihl += sizeof(struct icmp6hdr); + if (!pskb_may_pull(skb, ihl + sizeof(struct sfe_ipv6_ip_hdr) + sizeof(struct sfe_ipv6_ext_hdr))) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_ICMP_IPV6_HEADER_INCOMPLETE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("Embedded IP header not complete\n"); + return 0; + } + + /* + * Is our embedded IP version wrong? + */ + icmp_iph = (struct sfe_ipv6_ip_hdr *)(icmph + 1); + if (unlikely(icmp_iph->version != 6)) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_ICMP_IPV6_NON_V6]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("IP version: %u\n", icmp_iph->version); + return 0; + } + + len -= sizeof(struct sfe_ipv6_ip_hdr); + ihl += sizeof(struct sfe_ipv6_ip_hdr); + next_hdr = icmp_iph->nexthdr; + while (unlikely(sfe_ipv6_is_ext_hdr(next_hdr))) { + struct sfe_ipv6_ext_hdr *ext_hdr; + unsigned int ext_hdr_len; + + ext_hdr = (struct sfe_ipv6_ext_hdr *)(skb->data + ihl); + if (next_hdr == SFE_IPV6_EXT_HDR_FRAG) { + struct sfe_ipv6_frag_hdr *frag_hdr = (struct sfe_ipv6_frag_hdr *)ext_hdr; + unsigned int frag_off = ntohs(frag_hdr->frag_off); + + if (frag_off & SFE_IPV6_FRAG_OFFSET) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_NON_INITIAL_FRAGMENT]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("non-initial fragment\n"); + return 0; + } + } + + ext_hdr_len = ext_hdr->hdr_len; + ext_hdr_len <<= 3; + ext_hdr_len += sizeof(struct sfe_ipv6_ext_hdr); + len -= ext_hdr_len; + ihl += ext_hdr_len; + /* + * We should have 8 bytes of next header - that's enough to identify + * the connection. 
+ */ + if (!pskb_may_pull(skb, ihl + sizeof(struct sfe_ipv6_ext_hdr))) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_HEADER_INCOMPLETE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("extension header %d not completed\n", next_hdr); + return 0; + } + + next_hdr = ext_hdr->next_hdr; + } + + /* + * Handle the embedded transport layer header. + */ + switch (next_hdr) { + case IPPROTO_UDP: + icmp_udph = (struct sfe_ipv6_udp_hdr *)(skb->data + ihl); + src_port = icmp_udph->source; + dest_port = icmp_udph->dest; + break; + + case IPPROTO_TCP: + icmp_tcph = (struct sfe_ipv6_tcp_hdr *)(skb->data + ihl); + src_port = icmp_tcph->source; + dest_port = icmp_tcph->dest; + break; + + default: + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_ICMP_IPV6_UNHANDLED_PROTOCOL]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("Unhandled embedded IP protocol: %u\n", next_hdr); + return 0; + } + + src_ip = &icmp_iph->saddr; + dest_ip = &icmp_iph->daddr; + + spin_lock_bh(&si->lock); + + /* + * Look for a connection match. Note that we reverse the source and destination + * here because our embedded message contains a packet that was sent in the + * opposite direction to the one in which we just received it. It will have + * been sent on the interface from which we received it though so that's still + * ok to use. + */ + cm = sfe_ipv6_find_connection_match(si, dev, icmp_iph->nexthdr, dest_ip, dest_port, src_ip, src_port); + if (unlikely(!cm)) { + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_ICMP_NO_CONNECTION]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("no connection found\n"); + return 0; + } + + /* + * We found a connection so now remove it from the connection list and flush + * its state. + */ + c = cm->connection; + sfe_ipv6_remove_connection(si, c); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_ICMP_FLUSHED_CONNECTION]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; +} + +/* + * sfe_ipv6_recv() + * Handle packet receives and forwaring. + * + * Returns 1 if the packet is forwarded or 0 if it isn't. + */ +int sfe_ipv6_recv(struct net_device *dev, struct sk_buff *skb) +{ + struct sfe_ipv6 *si = &__si6; + unsigned int len; + unsigned int payload_len; + unsigned int ihl = sizeof(struct sfe_ipv6_ip_hdr); + bool flush_on_find = false; + struct sfe_ipv6_ip_hdr *iph; + u8 next_hdr; + + /* + * Check that we have space for an IP header and an uplayer header here. + */ + len = skb->len; + if (!pskb_may_pull(skb, ihl + sizeof(struct sfe_ipv6_ext_hdr))) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_HEADER_INCOMPLETE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("len: %u is too short\n", len); + return 0; + } + + /* + * Is our IP version wrong? + */ + iph = (struct sfe_ipv6_ip_hdr *)skb->data; + if (unlikely(iph->version != 6)) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_NON_V6]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("IP version: %u\n", iph->version); + return 0; + } + + /* + * Does our datagram fit inside the skb? 
+ */ + payload_len = ntohs(iph->payload_len); + if (unlikely(payload_len > (len - ihl))) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_DATAGRAM_INCOMPLETE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("payload_len: %u, exceeds len: %u\n", payload_len, (len - (unsigned int)sizeof(struct sfe_ipv6_ip_hdr))); + return 0; + } + + next_hdr = iph->nexthdr; + while (unlikely(sfe_ipv6_is_ext_hdr(next_hdr))) { + struct sfe_ipv6_ext_hdr *ext_hdr; + unsigned int ext_hdr_len; + + ext_hdr = (struct sfe_ipv6_ext_hdr *)(skb->data + ihl); + if (next_hdr == SFE_IPV6_EXT_HDR_FRAG) { + struct sfe_ipv6_frag_hdr *frag_hdr = (struct sfe_ipv6_frag_hdr *)ext_hdr; + unsigned int frag_off = ntohs(frag_hdr->frag_off); + + if (frag_off & SFE_IPV6_FRAG_OFFSET) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_NON_INITIAL_FRAGMENT]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("non-initial fragment\n"); + return 0; + } + } + + ext_hdr_len = ext_hdr->hdr_len; + ext_hdr_len <<= 3; + ext_hdr_len += sizeof(struct sfe_ipv6_ext_hdr); + ihl += ext_hdr_len; + if (!pskb_may_pull(skb, ihl + sizeof(struct sfe_ipv6_ext_hdr))) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_HEADER_INCOMPLETE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("extension header %d not completed\n", next_hdr); + return 0; + } + + flush_on_find = true; + next_hdr = ext_hdr->next_hdr; + } + + if (IPPROTO_UDP == next_hdr) { + return sfe_ipv6_recv_udp(si, skb, dev, len, iph, ihl, flush_on_find); + } + + if (IPPROTO_TCP == next_hdr) { + return sfe_ipv6_recv_tcp(si, skb, dev, len, iph, ihl, flush_on_find); + } + + if (IPPROTO_ICMPV6 == next_hdr) { + return sfe_ipv6_recv_icmp(si, skb, dev, len, iph, ihl); + } + + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_UNHANDLED_PROTOCOL]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("not UDP, TCP or ICMP: %u\n", next_hdr); + return 0; +} + +/* + * sfe_ipv6_update_tcp_state() + * update TCP window variables. 
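+ *
+ * This is reached through sfe_ipv6_update_rule(). A sketch of a caller
+ * refreshing an existing rule (illustrative only; the window values shown
+ * would typically come from conntrack's per-direction TCP state):
+ *
+ *	struct sfe_connection_create sic;
+ *
+ *	memset(&sic, 0, sizeof(sic));
+ *	sic.protocol = IPPROTO_TCP;
+ *	sic.src_ip = existing_src_ip;
+ *	sic.src_port = existing_src_port;
+ *	sic.dest_ip = existing_dest_ip;
+ *	sic.dest_port = existing_dest_port;
+ *	sic.src_td_max_window = orig_td_maxwin;
+ *	sic.src_td_end = orig_td_end;
+ *	sic.src_td_max_end = orig_td_maxend;
+ *	sic.dest_td_max_window = repl_td_maxwin;
+ *	sic.dest_td_end = repl_td_end;
+ *	sic.dest_td_max_end = repl_td_maxend;
+ *	sfe_ipv6_update_rule(&sic);
+ *
+ * where the existing_* and *_td_* values are placeholders for the caller's
+ * own state.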
+ */ +static void +sfe_ipv6_update_tcp_state(struct sfe_ipv6_connection *c, + struct sfe_connection_create *sic) +{ + struct sfe_ipv6_connection_match *orig_cm; + struct sfe_ipv6_connection_match *repl_cm; + struct sfe_ipv6_tcp_connection_match *orig_tcp; + struct sfe_ipv6_tcp_connection_match *repl_tcp; + + orig_cm = c->original_match; + repl_cm = c->reply_match; + orig_tcp = &orig_cm->protocol_state.tcp; + repl_tcp = &repl_cm->protocol_state.tcp; + + /* update orig */ + if (orig_tcp->max_win < sic->src_td_max_window) { + orig_tcp->max_win = sic->src_td_max_window; + } + if ((s32)(orig_tcp->end - sic->src_td_end) < 0) { + orig_tcp->end = sic->src_td_end; + } + if ((s32)(orig_tcp->max_end - sic->src_td_max_end) < 0) { + orig_tcp->max_end = sic->src_td_max_end; + } + + /* update reply */ + if (repl_tcp->max_win < sic->dest_td_max_window) { + repl_tcp->max_win = sic->dest_td_max_window; + } + if ((s32)(repl_tcp->end - sic->dest_td_end) < 0) { + repl_tcp->end = sic->dest_td_end; + } + if ((s32)(repl_tcp->max_end - sic->dest_td_max_end) < 0) { + repl_tcp->max_end = sic->dest_td_max_end; + } + + /* update match flags */ + orig_cm->flags &= ~SFE_IPV6_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK; + repl_cm->flags &= ~SFE_IPV6_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK; + if (sic->flags & SFE_CREATE_FLAG_NO_SEQ_CHECK) { + orig_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK; + repl_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK; + } +} + +/* + * sfe_ipv6_update_protocol_state() + * update protocol specified state machine. + */ +static void +sfe_ipv6_update_protocol_state(struct sfe_ipv6_connection *c, + struct sfe_connection_create *sic) +{ + switch (sic->protocol) { + case IPPROTO_TCP: + sfe_ipv6_update_tcp_state(c, sic); + break; + } +} + +/* + * sfe_ipv6_update_rule() + * update forwarding rule after rule is created. + */ +void sfe_ipv6_update_rule(struct sfe_connection_create *sic) +{ + struct sfe_ipv6_connection *c; + struct sfe_ipv6 *si = &__si6; + + spin_lock_bh(&si->lock); + + c = sfe_ipv6_find_connection(si, + sic->protocol, + sic->src_ip.ip6, + sic->src_port, + sic->dest_ip.ip6, + sic->dest_port); + if (c != NULL) { + sfe_ipv6_update_protocol_state(c, sic); + } + + spin_unlock_bh(&si->lock); +} + +/* + * sfe_ipv6_create_rule() + * Create a forwarding rule. + */ +int sfe_ipv6_create_rule(struct sfe_connection_create *sic) +{ + struct sfe_ipv6 *si = &__si6; + struct sfe_ipv6_connection *c; + struct sfe_ipv6_connection_match *original_cm; + struct sfe_ipv6_connection_match *reply_cm; + struct net_device *dest_dev; + struct net_device *src_dev; + + dest_dev = sic->dest_dev; + src_dev = sic->src_dev; + + if (unlikely((dest_dev->reg_state != NETREG_REGISTERED) || + (src_dev->reg_state != NETREG_REGISTERED))) { + return -EINVAL; + } + + spin_lock_bh(&si->lock); + si->connection_create_requests++; + + /* + * Check to see if there is already a flow that matches the rule we're + * trying to create. If there is then we can't create a new one. + */ + c = sfe_ipv6_find_connection(si, + sic->protocol, + sic->src_ip.ip6, + sic->src_port, + sic->dest_ip.ip6, + sic->dest_port); + if (c != NULL) { + si->connection_create_collisions++; + + /* + * If we already have the flow then it's likely that this + * request to create the connection rule contains more + * up-to-date information. Check and update accordingly. 
+ */ + sfe_ipv6_update_protocol_state(c, sic); + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("connection already exists - mark: %08x, p: %d\n" + " s: %s:%pxM:%pI6:%u, d: %s:%pxM:%pI6:%u\n", + sic->mark, sic->protocol, + sic->src_dev->name, sic->src_mac, sic->src_ip.ip6, ntohs(sic->src_port), + sic->dest_dev->name, sic->dest_mac, sic->dest_ip.ip6, ntohs(sic->dest_port)); + return -EADDRINUSE; + } + + /* + * Allocate the various connection tracking objects. + */ + c = (struct sfe_ipv6_connection *)kmalloc(sizeof(struct sfe_ipv6_connection), GFP_ATOMIC); + if (unlikely(!c)) { + spin_unlock_bh(&si->lock); + return -ENOMEM; + } + + original_cm = (struct sfe_ipv6_connection_match *)kmalloc(sizeof(struct sfe_ipv6_connection_match), GFP_ATOMIC); + if (unlikely(!original_cm)) { + spin_unlock_bh(&si->lock); + kfree(c); + return -ENOMEM; + } + + reply_cm = (struct sfe_ipv6_connection_match *)kmalloc(sizeof(struct sfe_ipv6_connection_match), GFP_ATOMIC); + if (unlikely(!reply_cm)) { + spin_unlock_bh(&si->lock); + kfree(original_cm); + kfree(c); + return -ENOMEM; + } + + /* + * Fill in the "original" direction connection matching object. + * Note that the transmit MAC address is "dest_mac_xlate" because + * we always know both ends of a connection by their translated + * addresses and not their public addresses. + */ + original_cm->match_dev = src_dev; + original_cm->match_protocol = sic->protocol; + original_cm->match_src_ip[0] = sic->src_ip.ip6[0]; + original_cm->match_src_port = sic->src_port; + original_cm->match_dest_ip[0] = sic->dest_ip.ip6[0]; + original_cm->match_dest_port = sic->dest_port; + original_cm->xlate_src_ip[0] = sic->src_ip_xlate.ip6[0]; + original_cm->xlate_src_port = sic->src_port_xlate; + original_cm->xlate_dest_ip[0] = sic->dest_ip_xlate.ip6[0]; + original_cm->xlate_dest_port = sic->dest_port_xlate; + original_cm->rx_packet_count = 0; + original_cm->rx_packet_count64 = 0; + original_cm->rx_byte_count = 0; + original_cm->rx_byte_count64 = 0; + original_cm->xmit_dev = dest_dev; + original_cm->xmit_dev_mtu = sic->dest_mtu; + memcpy(original_cm->xmit_src_mac, dest_dev->dev_addr, ETH_ALEN); + memcpy(original_cm->xmit_dest_mac, sic->dest_mac_xlate, ETH_ALEN); + original_cm->connection = c; + original_cm->counter_match = reply_cm; + original_cm->flags = 0; + if (sic->flags & SFE_CREATE_FLAG_REMARK_PRIORITY) { + original_cm->priority = sic->src_priority; + original_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_PRIORITY_REMARK; + } + if (sic->flags & SFE_CREATE_FLAG_REMARK_DSCP) { + original_cm->dscp = sic->src_dscp << SFE_IPV6_DSCP_SHIFT; + original_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_DSCP_REMARK; + } +#ifdef CONFIG_NF_FLOW_COOKIE + original_cm->flow_cookie = 0; +#endif +#ifdef CONFIG_XFRM + original_cm->flow_accel = sic->original_accel; +#endif + original_cm->active_next = NULL; + original_cm->active_prev = NULL; + original_cm->active = false; + + /* + * For PPP links we don't write an L2 header. For everything else we do. + */ + if (!(dest_dev->flags & IFF_POINTOPOINT)) { + original_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_WRITE_L2_HDR; + + /* + * If our dev writes Ethernet headers then we can write a really fast + * version. + */ + if (dest_dev->header_ops) { + if (dest_dev->header_ops->create == eth_header) { + original_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_WRITE_FAST_ETH_HDR; + } + } + } + + /* + * Fill in the "reply" direction connection matching object. 
+ */ + reply_cm->match_dev = dest_dev; + reply_cm->match_protocol = sic->protocol; + reply_cm->match_src_ip[0] = sic->dest_ip_xlate.ip6[0]; + reply_cm->match_src_port = sic->dest_port_xlate; + reply_cm->match_dest_ip[0] = sic->src_ip_xlate.ip6[0]; + reply_cm->match_dest_port = sic->src_port_xlate; + reply_cm->xlate_src_ip[0] = sic->dest_ip.ip6[0]; + reply_cm->xlate_src_port = sic->dest_port; + reply_cm->xlate_dest_ip[0] = sic->src_ip.ip6[0]; + reply_cm->xlate_dest_port = sic->src_port; + reply_cm->rx_packet_count = 0; + reply_cm->rx_packet_count64 = 0; + reply_cm->rx_byte_count = 0; + reply_cm->rx_byte_count64 = 0; + reply_cm->xmit_dev = src_dev; + reply_cm->xmit_dev_mtu = sic->src_mtu; + memcpy(reply_cm->xmit_src_mac, src_dev->dev_addr, ETH_ALEN); + memcpy(reply_cm->xmit_dest_mac, sic->src_mac, ETH_ALEN); + reply_cm->connection = c; + reply_cm->counter_match = original_cm; + reply_cm->flags = 0; + if (sic->flags & SFE_CREATE_FLAG_REMARK_PRIORITY) { + reply_cm->priority = sic->dest_priority; + reply_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_PRIORITY_REMARK; + } + if (sic->flags & SFE_CREATE_FLAG_REMARK_DSCP) { + reply_cm->dscp = sic->dest_dscp << SFE_IPV6_DSCP_SHIFT; + reply_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_DSCP_REMARK; + } +#ifdef CONFIG_NF_FLOW_COOKIE + reply_cm->flow_cookie = 0; +#endif +#ifdef CONFIG_XFRM + reply_cm->flow_accel = sic->reply_accel; +#endif + reply_cm->active_next = NULL; + reply_cm->active_prev = NULL; + reply_cm->active = false; + + /* + * For PPP links we don't write an L2 header. For everything else we do. + */ + if (!(src_dev->flags & IFF_POINTOPOINT)) { + reply_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_WRITE_L2_HDR; + + /* + * If our dev writes Ethernet headers then we can write a really fast + * version. + */ + if (src_dev->header_ops) { + if (src_dev->header_ops->create == eth_header) { + reply_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_WRITE_FAST_ETH_HDR; + } + } + } + + + if (!sfe_ipv6_addr_equal(sic->dest_ip.ip6, sic->dest_ip_xlate.ip6) || sic->dest_port != sic->dest_port_xlate) { + original_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_XLATE_DEST; + reply_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_XLATE_SRC; + } + + if (!sfe_ipv6_addr_equal(sic->src_ip.ip6, sic->src_ip_xlate.ip6) || sic->src_port != sic->src_port_xlate) { + original_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_XLATE_SRC; + reply_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_XLATE_DEST; + } + + c->protocol = sic->protocol; + c->src_ip[0] = sic->src_ip.ip6[0]; + c->src_ip_xlate[0] = sic->src_ip_xlate.ip6[0]; + c->src_port = sic->src_port; + c->src_port_xlate = sic->src_port_xlate; + c->original_dev = src_dev; + c->original_match = original_cm; + c->dest_ip[0] = sic->dest_ip.ip6[0]; + c->dest_ip_xlate[0] = sic->dest_ip_xlate.ip6[0]; + c->dest_port = sic->dest_port; + c->dest_port_xlate = sic->dest_port_xlate; + c->reply_dev = dest_dev; + c->reply_match = reply_cm; + c->mark = sic->mark; + c->debug_read_seq = 0; + c->last_sync_jiffies = get_jiffies_64(); + + /* + * Take hold of our source and dest devices for the duration of the connection. + */ + dev_hold(c->original_dev); + dev_hold(c->reply_dev); + + /* + * Initialize the protocol-specific information that we track. + */ + switch (sic->protocol) { + case IPPROTO_TCP: + original_cm->protocol_state.tcp.win_scale = sic->src_td_window_scale; + original_cm->protocol_state.tcp.max_win = sic->src_td_max_window ? 
sic->src_td_max_window : 1; + original_cm->protocol_state.tcp.end = sic->src_td_end; + original_cm->protocol_state.tcp.max_end = sic->src_td_max_end; + reply_cm->protocol_state.tcp.win_scale = sic->dest_td_window_scale; + reply_cm->protocol_state.tcp.max_win = sic->dest_td_max_window ? sic->dest_td_max_window : 1; + reply_cm->protocol_state.tcp.end = sic->dest_td_end; + reply_cm->protocol_state.tcp.max_end = sic->dest_td_max_end; + if (sic->flags & SFE_CREATE_FLAG_NO_SEQ_CHECK) { + original_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK; + reply_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK; + } + break; + } + + sfe_ipv6_connection_match_compute_translations(original_cm); + sfe_ipv6_connection_match_compute_translations(reply_cm); + sfe_ipv6_insert_connection(si, c); + + spin_unlock_bh(&si->lock); + + /* + * We have everything we need! + */ + DEBUG_INFO("new connection - mark: %08x, p: %d\n" + " s: %s:%pxM(%pxM):%pI6(%pI6):%u(%u)\n" + " d: %s:%pxM(%pxM):%pI6(%pI6):%u(%u)\n", + sic->mark, sic->protocol, + sic->src_dev->name, sic->src_mac, sic->src_mac_xlate, + sic->src_ip.ip6, sic->src_ip_xlate.ip6, ntohs(sic->src_port), ntohs(sic->src_port_xlate), + dest_dev->name, sic->dest_mac, sic->dest_mac_xlate, + sic->dest_ip.ip6, sic->dest_ip_xlate.ip6, ntohs(sic->dest_port), ntohs(sic->dest_port_xlate)); + + return 0; +} + +/* + * sfe_ipv6_destroy_rule() + * Destroy a forwarding rule. + */ +void sfe_ipv6_destroy_rule(struct sfe_connection_destroy *sid) +{ + struct sfe_ipv6 *si = &__si6; + struct sfe_ipv6_connection *c; + + spin_lock_bh(&si->lock); + si->connection_destroy_requests++; + + /* + * Check to see if we have a flow that matches the rule we're trying + * to destroy. If there isn't then we can't destroy it. + */ + c = sfe_ipv6_find_connection(si, sid->protocol, sid->src_ip.ip6, sid->src_port, + sid->dest_ip.ip6, sid->dest_port); + if (!c) { + si->connection_destroy_misses++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("connection does not exist - p: %d, s: %pI6:%u, d: %pI6:%u\n", + sid->protocol, sid->src_ip.ip6, ntohs(sid->src_port), + sid->dest_ip.ip6, ntohs(sid->dest_port)); + return; + } + + /* + * Remove our connection details from the hash tables. + */ + sfe_ipv6_remove_connection(si, c); + spin_unlock_bh(&si->lock); + + sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_DESTROY); + + DEBUG_INFO("connection destroyed - p: %d, s: %pI6:%u, d: %pI6:%u\n", + sid->protocol, sid->src_ip.ip6, ntohs(sid->src_port), + sid->dest_ip.ip6, ntohs(sid->dest_port)); +} + +/* + * sfe_ipv6_register_sync_rule_callback() + * Register a callback for rule synchronization. + */ +void sfe_ipv6_register_sync_rule_callback(sfe_sync_rule_callback_t sync_rule_callback) +{ + struct sfe_ipv6 *si = &__si6; + + spin_lock_bh(&si->lock); + rcu_assign_pointer(si->sync_rule_callback, sync_rule_callback); + spin_unlock_bh(&si->lock); +} + +/* + * sfe_ipv6_get_debug_dev() + */ +static ssize_t sfe_ipv6_get_debug_dev(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct sfe_ipv6 *si = &__si6; + ssize_t count; + int num; + + spin_lock_bh(&si->lock); + num = si->debug_dev; + spin_unlock_bh(&si->lock); + + count = snprintf(buf, (ssize_t)PAGE_SIZE, "%d\n", num); + return count; +} + +/* + * sfe_ipv6_destroy_all_rules_for_dev() + * Destroy all connections that match a particular device. + * + * If we pass dev as NULL then this destroys all connections. 
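+ *
+ * Connections are torn down one at a time: each match is unlinked while the
+ * lock is held, the lock is dropped, the connection is flushed (which may
+ * invoke the sync callback) and the scan restarts from the head of the
+ * list, so the all-connections list is never walked without the lock.
+ * Callers include the NETDEV_DOWN notifiers in the connection manager and
+ * sfe_ipv6_exit(), which passes a NULL dev.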
+ */ +void sfe_ipv6_destroy_all_rules_for_dev(struct net_device *dev) +{ + struct sfe_ipv6 *si = &__si6; + struct sfe_ipv6_connection *c; + +another_round: + spin_lock_bh(&si->lock); + + for (c = si->all_connections_head; c; c = c->all_connections_next) { + /* + * Does this connection relate to the device we are destroying? + */ + if (!dev + || (dev == c->original_dev) + || (dev == c->reply_dev)) { + break; + } + } + + if (c) { + sfe_ipv6_remove_connection(si, c); + } + + spin_unlock_bh(&si->lock); + + if (c) { + sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_DESTROY); + goto another_round; + } +} + +/* + * sfe_ipv6_periodic_sync() + */ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 15, 0)) +static void sfe_ipv6_periodic_sync(unsigned long arg) +#else +static void sfe_ipv6_periodic_sync(struct timer_list *tl) +#endif +{ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 15, 0)) + struct sfe_ipv6 *si = (struct sfe_ipv6 *)arg; +#else + struct sfe_ipv6 *si = from_timer(si, tl, timer); +#endif + u64 now_jiffies; + int quota; + sfe_sync_rule_callback_t sync_rule_callback; + + now_jiffies = get_jiffies_64(); + + rcu_read_lock(); + sync_rule_callback = rcu_dereference(si->sync_rule_callback); + if (!sync_rule_callback) { + rcu_read_unlock(); + goto done; + } + + spin_lock_bh(&si->lock); + sfe_ipv6_update_summary_stats(si); + + /* + * Get an estimate of the number of connections to parse in this sync. + */ + quota = (si->num_connections + 63) / 64; + + /* + * Walk the "active" list and sync the connection state. + */ + while (quota--) { + struct sfe_ipv6_connection_match *cm; + struct sfe_ipv6_connection_match *counter_cm; + struct sfe_ipv6_connection *c; + struct sfe_connection_sync sis; + + cm = si->active_head; + if (!cm) { + break; + } + + /* + * There's a possibility that our counter match is in the active list too. + * If it is then remove it. + */ + counter_cm = cm->counter_match; + if (counter_cm->active) { + counter_cm->active = false; + + /* + * We must have a connection preceding this counter match + * because that's the one that got us to this point, so we don't have + * to worry about removing the head of the list. + */ + counter_cm->active_prev->active_next = counter_cm->active_next; + + if (likely(counter_cm->active_next)) { + counter_cm->active_next->active_prev = counter_cm->active_prev; + } else { + si->active_tail = counter_cm->active_prev; + } + + counter_cm->active_next = NULL; + counter_cm->active_prev = NULL; + } + + /* + * Now remove the head of the active scan list. + */ + cm->active = false; + si->active_head = cm->active_next; + if (likely(cm->active_next)) { + cm->active_next->active_prev = NULL; + } else { + si->active_tail = NULL; + } + cm->active_next = NULL; + + /* + * Sync the connection state. + */ + c = cm->connection; + sfe_ipv6_gen_sync_connection(si, c, &sis, SFE_SYNC_REASON_STATS, now_jiffies); + + /* + * We don't want to be holding the lock when we sync! + */ + spin_unlock_bh(&si->lock); + sync_rule_callback(&sis); + spin_lock_bh(&si->lock); + } + + spin_unlock_bh(&si->lock); + rcu_read_unlock(); + +done: + mod_timer(&si->timer, jiffies + ((HZ + 99) / 100)); +} + +/* + * sfe_ipv6_debug_dev_read_start() + * Generate part of the XML output. 
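+ *
+ * Each of the sfe_ipv6_debug_dev_read_*() helpers below writes one fragment
+ * of the dump into a CHAR_DEV_MSG_SIZE scratch buffer, copies it to the
+ * user buffer and advances ws->state; sfe_ipv6_debug_dev_read() keeps
+ * dispatching the handler for the current state until the state machine
+ * reaches SFE_IPV6_DEBUG_XML_STATE_DONE or the remaining user buffer is no
+ * larger than CHAR_DEV_MSG_SIZE.
+ *
+ * A minimal userspace sketch for consuming the dump (the device node path
+ * is illustrative - the node has to be created against the dynamic major
+ * number returned by register_chrdev(), and the read buffer only needs to
+ * be larger than CHAR_DEV_MSG_SIZE):
+ *
+ *	#include <fcntl.h>
+ *	#include <stdio.h>
+ *	#include <unistd.h>
+ *
+ *	int main(void)
+ *	{
+ *		char buf[4096];
+ *		ssize_t n;
+ *		int fd = open("/dev/sfe_ipv6", O_RDONLY);
+ *
+ *		if (fd < 0)
+ *			return 1;
+ *		while ((n = read(fd, buf, sizeof(buf))) > 0)
+ *			fwrite(buf, 1, (size_t)n, stdout);
+ *		close(fd);
+ *		return 0;
+ *	}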
+ */ +static bool sfe_ipv6_debug_dev_read_start(struct sfe_ipv6 *si, char *buffer, char *msg, size_t *length, + int *total_read, struct sfe_ipv6_debug_xml_write_state *ws) +{ + int bytes_read; + + si->debug_read_seq++; + + bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\n"); + if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) { + return false; + } + + *length -= bytes_read; + *total_read += bytes_read; + + ws->state++; + return true; +} + +/* + * sfe_ipv6_debug_dev_read_connections_start() + * Generate part of the XML output. + */ +static bool sfe_ipv6_debug_dev_read_connections_start(struct sfe_ipv6 *si, char *buffer, char *msg, size_t *length, + int *total_read, struct sfe_ipv6_debug_xml_write_state *ws) +{ + int bytes_read; + + bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t\n"); + if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) { + return false; + } + + *length -= bytes_read; + *total_read += bytes_read; + + ws->state++; + return true; +} + +/* + * sfe_ipv6_debug_dev_read_connections_connection() + * Generate part of the XML output. + */ +static bool sfe_ipv6_debug_dev_read_connections_connection(struct sfe_ipv6 *si, char *buffer, char *msg, size_t *length, + int *total_read, struct sfe_ipv6_debug_xml_write_state *ws) +{ + struct sfe_ipv6_connection *c; + struct sfe_ipv6_connection_match *original_cm; + struct sfe_ipv6_connection_match *reply_cm; + int bytes_read; + int protocol; + struct net_device *src_dev; + struct sfe_ipv6_addr src_ip; + struct sfe_ipv6_addr src_ip_xlate; + __be16 src_port; + __be16 src_port_xlate; + u64 src_rx_packets; + u64 src_rx_bytes; + struct net_device *dest_dev; + struct sfe_ipv6_addr dest_ip; + struct sfe_ipv6_addr dest_ip_xlate; + __be16 dest_port; + __be16 dest_port_xlate; + u64 dest_rx_packets; + u64 dest_rx_bytes; + u64 last_sync_jiffies; + u32 mark, src_priority, dest_priority, src_dscp, dest_dscp; +#ifdef CONFIG_NF_FLOW_COOKIE + int src_flow_cookie, dst_flow_cookie; +#endif + + spin_lock_bh(&si->lock); + + for (c = si->all_connections_head; c; c = c->all_connections_next) { + if (c->debug_read_seq < si->debug_read_seq) { + c->debug_read_seq = si->debug_read_seq; + break; + } + } + + /* + * If there were no connections then move to the next state. 
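+ * The debug_read_seq stamp is what makes this helper safely re-entrant: the
+ * loop above picks the first connection that has not yet been stamped with
+ * the current dump's sequence number (bumped in the start handler), so each
+ * connection is emitted exactly once per read pass.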
+ */ + if (!c) { + spin_unlock_bh(&si->lock); + ws->state++; + return true; + } + + original_cm = c->original_match; + reply_cm = c->reply_match; + + protocol = c->protocol; + src_dev = c->original_dev; + src_ip = c->src_ip[0]; + src_ip_xlate = c->src_ip_xlate[0]; + src_port = c->src_port; + src_port_xlate = c->src_port_xlate; + src_priority = original_cm->priority; + src_dscp = original_cm->dscp >> SFE_IPV6_DSCP_SHIFT; + + sfe_ipv6_connection_match_update_summary_stats(original_cm); + sfe_ipv6_connection_match_update_summary_stats(reply_cm); + + src_rx_packets = original_cm->rx_packet_count64; + src_rx_bytes = original_cm->rx_byte_count64; + dest_dev = c->reply_dev; + dest_ip = c->dest_ip[0]; + dest_ip_xlate = c->dest_ip_xlate[0]; + dest_port = c->dest_port; + dest_port_xlate = c->dest_port_xlate; + dest_priority = reply_cm->priority; + dest_dscp = reply_cm->dscp >> SFE_IPV6_DSCP_SHIFT; + dest_rx_packets = reply_cm->rx_packet_count64; + dest_rx_bytes = reply_cm->rx_byte_count64; + last_sync_jiffies = get_jiffies_64() - c->last_sync_jiffies; + mark = c->mark; +#ifdef CONFIG_NF_FLOW_COOKIE + src_flow_cookie = original_cm->flow_cookie; + dst_flow_cookie = reply_cm->flow_cookie; +#endif + spin_unlock_bh(&si->lock); + + bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t\t\n", + protocol, + src_dev->name, + &src_ip, &src_ip_xlate, + ntohs(src_port), ntohs(src_port_xlate), + src_priority, src_dscp, + src_rx_packets, src_rx_bytes, + dest_dev->name, + &dest_ip, &dest_ip_xlate, + ntohs(dest_port), ntohs(dest_port_xlate), + dest_priority, dest_dscp, + dest_rx_packets, dest_rx_bytes, +#ifdef CONFIG_NF_FLOW_COOKIE + src_flow_cookie, dst_flow_cookie, +#endif + last_sync_jiffies, mark); + + if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) { + return false; + } + + *length -= bytes_read; + *total_read += bytes_read; + + return true; +} + +/* + * sfe_ipv6_debug_dev_read_connections_end() + * Generate part of the XML output. + */ +static bool sfe_ipv6_debug_dev_read_connections_end(struct sfe_ipv6 *si, char *buffer, char *msg, size_t *length, + int *total_read, struct sfe_ipv6_debug_xml_write_state *ws) +{ + int bytes_read; + + bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t\n"); + if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) { + return false; + } + + *length -= bytes_read; + *total_read += bytes_read; + + ws->state++; + return true; +} + +/* + * sfe_ipv6_debug_dev_read_exceptions_start() + * Generate part of the XML output. + */ +static bool sfe_ipv6_debug_dev_read_exceptions_start(struct sfe_ipv6 *si, char *buffer, char *msg, size_t *length, + int *total_read, struct sfe_ipv6_debug_xml_write_state *ws) +{ + int bytes_read; + + bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t\n"); + if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) { + return false; + } + + *length -= bytes_read; + *total_read += bytes_read; + + ws->state++; + return true; +} + +/* + * sfe_ipv6_debug_dev_read_exceptions_exception() + * Generate part of the XML output. 
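+ *
+ * Called once per exception counter: only non-zero counters produce an
+ * element, ws->iter_exception walks the counter array, and the state
+ * machine only advances once the last counter has been visited.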
+ */ +static bool sfe_ipv6_debug_dev_read_exceptions_exception(struct sfe_ipv6 *si, char *buffer, char *msg, size_t *length, + int *total_read, struct sfe_ipv6_debug_xml_write_state *ws) +{ + u64 ct; + + spin_lock_bh(&si->lock); + ct = si->exception_events64[ws->iter_exception]; + spin_unlock_bh(&si->lock); + + if (ct) { + int bytes_read; + + bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, + "\t\t\n", + sfe_ipv6_exception_events_string[ws->iter_exception], + ct); + if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) { + return false; + } + + *length -= bytes_read; + *total_read += bytes_read; + } + + ws->iter_exception++; + if (ws->iter_exception >= SFE_IPV6_EXCEPTION_EVENT_LAST) { + ws->iter_exception = 0; + ws->state++; + } + + return true; +} + +/* + * sfe_ipv6_debug_dev_read_exceptions_end() + * Generate part of the XML output. + */ +static bool sfe_ipv6_debug_dev_read_exceptions_end(struct sfe_ipv6 *si, char *buffer, char *msg, size_t *length, + int *total_read, struct sfe_ipv6_debug_xml_write_state *ws) +{ + int bytes_read; + + bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t\n"); + if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) { + return false; + } + + *length -= bytes_read; + *total_read += bytes_read; + + ws->state++; + return true; +} + +/* + * sfe_ipv6_debug_dev_read_stats() + * Generate part of the XML output. + */ +static bool sfe_ipv6_debug_dev_read_stats(struct sfe_ipv6 *si, char *buffer, char *msg, size_t *length, + int *total_read, struct sfe_ipv6_debug_xml_write_state *ws) +{ + int bytes_read; + unsigned int num_connections; + u64 packets_forwarded; + u64 packets_not_forwarded; + u64 connection_create_requests; + u64 connection_create_collisions; + u64 connection_destroy_requests; + u64 connection_destroy_misses; + u64 connection_flushes; + u64 connection_match_hash_hits; + u64 connection_match_hash_reorders; + + spin_lock_bh(&si->lock); + sfe_ipv6_update_summary_stats(si); + + num_connections = si->num_connections; + packets_forwarded = si->packets_forwarded64; + packets_not_forwarded = si->packets_not_forwarded64; + connection_create_requests = si->connection_create_requests64; + connection_create_collisions = si->connection_create_collisions64; + connection_destroy_requests = si->connection_destroy_requests64; + connection_destroy_misses = si->connection_destroy_misses64; + connection_flushes = si->connection_flushes64; + connection_match_hash_hits = si->connection_match_hash_hits64; + connection_match_hash_reorders = si->connection_match_hash_reorders64; + spin_unlock_bh(&si->lock); + + bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t\n", + num_connections, + packets_forwarded, + packets_not_forwarded, + connection_create_requests, + connection_create_collisions, + connection_destroy_requests, + connection_destroy_misses, + connection_flushes, + connection_match_hash_hits, + connection_match_hash_reorders); + if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) { + return false; + } + + *length -= bytes_read; + *total_read += bytes_read; + + ws->state++; + return true; +} + +/* + * sfe_ipv6_debug_dev_read_end() + * Generate part of the XML output. 
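+ *
+ * Emits the closing element of the dump and advances ws->state to its final
+ * value, which terminates the read loop in sfe_ipv6_debug_dev_read().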
+ */ +static bool sfe_ipv6_debug_dev_read_end(struct sfe_ipv6 *si, char *buffer, char *msg, size_t *length, + int *total_read, struct sfe_ipv6_debug_xml_write_state *ws) +{ + int bytes_read; + + bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\n"); + if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) { + return false; + } + + *length -= bytes_read; + *total_read += bytes_read; + + ws->state++; + return true; +} + +/* + * Array of write functions that write various XML elements that correspond to + * our XML output state machine. + */ +static sfe_ipv6_debug_xml_write_method_t sfe_ipv6_debug_xml_write_methods[SFE_IPV6_DEBUG_XML_STATE_DONE] = { + sfe_ipv6_debug_dev_read_start, + sfe_ipv6_debug_dev_read_connections_start, + sfe_ipv6_debug_dev_read_connections_connection, + sfe_ipv6_debug_dev_read_connections_end, + sfe_ipv6_debug_dev_read_exceptions_start, + sfe_ipv6_debug_dev_read_exceptions_exception, + sfe_ipv6_debug_dev_read_exceptions_end, + sfe_ipv6_debug_dev_read_stats, + sfe_ipv6_debug_dev_read_end, +}; + +/* + * sfe_ipv6_debug_dev_read() + * Send info to userspace upon read request from user + */ +static ssize_t sfe_ipv6_debug_dev_read(struct file *filp, char *buffer, size_t length, loff_t *offset) +{ + char msg[CHAR_DEV_MSG_SIZE]; + int total_read = 0; + struct sfe_ipv6_debug_xml_write_state *ws; + struct sfe_ipv6 *si = &__si6; + + ws = (struct sfe_ipv6_debug_xml_write_state *)filp->private_data; + while ((ws->state != SFE_IPV6_DEBUG_XML_STATE_DONE) && (length > CHAR_DEV_MSG_SIZE)) { + if ((sfe_ipv6_debug_xml_write_methods[ws->state])(si, buffer, msg, &length, &total_read, ws)) { + continue; + } + } + + return total_read; +} + +/* + * sfe_ipv6_debug_dev_write() + * Write to char device resets some stats + */ +static ssize_t sfe_ipv6_debug_dev_write(struct file *filp, const char *buffer, size_t length, loff_t *offset) +{ + struct sfe_ipv6 *si = &__si6; + + spin_lock_bh(&si->lock); + sfe_ipv6_update_summary_stats(si); + + si->packets_forwarded64 = 0; + si->packets_not_forwarded64 = 0; + si->connection_create_requests64 = 0; + si->connection_create_collisions64 = 0; + si->connection_destroy_requests64 = 0; + si->connection_destroy_misses64 = 0; + si->connection_flushes64 = 0; + si->connection_match_hash_hits64 = 0; + si->connection_match_hash_reorders64 = 0; + spin_unlock_bh(&si->lock); + + return length; +} + +/* + * sfe_ipv6_debug_dev_open() + */ +static int sfe_ipv6_debug_dev_open(struct inode *inode, struct file *file) +{ + struct sfe_ipv6_debug_xml_write_state *ws; + + ws = (struct sfe_ipv6_debug_xml_write_state *)file->private_data; + if (ws) { + return 0; + } + + ws = kzalloc(sizeof(struct sfe_ipv6_debug_xml_write_state), GFP_KERNEL); + if (!ws) { + return -ENOMEM; + } + + ws->state = SFE_IPV6_DEBUG_XML_STATE_START; + file->private_data = ws; + + return 0; +} + +/* + * sfe_ipv6_debug_dev_release() + */ +static int sfe_ipv6_debug_dev_release(struct inode *inode, struct file *file) +{ + struct sfe_ipv6_debug_xml_write_state *ws; + + ws = (struct sfe_ipv6_debug_xml_write_state *)file->private_data; + if (ws) { + /* + * We've finished with our output so free the write state. 
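+ * It was allocated by sfe_ipv6_debug_dev_open() and parked in
+ * file->private_data, so it lives for exactly one open/read/release cycle.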
+ */ + kfree(ws); + } + + return 0; +} + +/* + * File operations used in the debug char device + */ +static struct file_operations sfe_ipv6_debug_dev_fops = { + .read = sfe_ipv6_debug_dev_read, + .write = sfe_ipv6_debug_dev_write, + .open = sfe_ipv6_debug_dev_open, + .release = sfe_ipv6_debug_dev_release +}; + +#ifdef CONFIG_NF_FLOW_COOKIE +/* + * sfe_ipv6_register_flow_cookie_cb + * register a function in SFE to let SFE use this function to configure flow cookie for a flow + * + * Hardware driver which support flow cookie should register a callback function in SFE. Then SFE + * can use this function to configure flow cookie for a flow. + * return: 0, success; !=0, fail + */ +int sfe_ipv6_register_flow_cookie_cb(sfe_ipv6_flow_cookie_set_func_t cb) +{ + struct sfe_ipv6 *si = &__si6; + + BUG_ON(!cb); + + if (si->flow_cookie_set_func) { + return -1; + } + + rcu_assign_pointer(si->flow_cookie_set_func, cb); + return 0; +} + +/* + * sfe_ipv6_unregister_flow_cookie_cb + * unregister function which is used to configure flow cookie for a flow + * + * return: 0, success; !=0, fail + */ +int sfe_ipv6_unregister_flow_cookie_cb(sfe_ipv6_flow_cookie_set_func_t cb) +{ + struct sfe_ipv6 *si = &__si6; + + RCU_INIT_POINTER(si->flow_cookie_set_func, NULL); + return 0; +} + +/* + * sfe_ipv6_get_flow_cookie() + */ +static ssize_t sfe_ipv6_get_flow_cookie(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct sfe_ipv6 *si = &__si6; + return snprintf(buf, (ssize_t)PAGE_SIZE, "%d\n", si->flow_cookie_enable); +} + +/* + * sfe_ipv6_set_flow_cookie() + */ +static ssize_t sfe_ipv6_set_flow_cookie(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t size) +{ + struct sfe_ipv6 *si = &__si6; + strict_strtol(buf, 0, (long int *)&si->flow_cookie_enable); + + return size; +} + +/* + * sysfs attributes. + */ +static const struct device_attribute sfe_ipv6_flow_cookie_attr = + __ATTR(flow_cookie_enable, S_IWUSR | S_IRUGO, sfe_ipv6_get_flow_cookie, sfe_ipv6_set_flow_cookie); +#endif /*CONFIG_NF_FLOW_COOKIE*/ + +/* + * sfe_ipv6_init() + */ +static int __init sfe_ipv6_init(void) +{ + struct sfe_ipv6 *si = &__si6; + int result = -1; + + DEBUG_INFO("SFE IPv6 init\n"); + + /* + * Create sys/sfe_ipv6 + */ + si->sys_sfe_ipv6 = kobject_create_and_add("sfe_ipv6", NULL); + if (!si->sys_sfe_ipv6) { + DEBUG_ERROR("failed to register sfe_ipv6\n"); + goto exit1; + } + + /* + * Create files, one for each parameter supported by this module. + */ + result = sysfs_create_file(si->sys_sfe_ipv6, &sfe_ipv6_debug_dev_attr.attr); + if (result) { + DEBUG_ERROR("failed to register debug dev file: %d\n", result); + goto exit2; + } + +#ifdef CONFIG_NF_FLOW_COOKIE + result = sysfs_create_file(si->sys_sfe_ipv6, &sfe_ipv6_flow_cookie_attr.attr); + if (result) { + DEBUG_ERROR("failed to register flow cookie enable file: %d\n", result); + goto exit3; + } +#endif /* CONFIG_NF_FLOW_COOKIE */ + + /* + * Register our debug char device. + */ + result = register_chrdev(0, "sfe_ipv6", &sfe_ipv6_debug_dev_fops); + if (result < 0) { + DEBUG_ERROR("Failed to register chrdev: %d\n", result); + goto exit4; + } + + si->debug_dev = result; + + /* + * Create a timer to handle periodic statistics. 
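+ * The period below, (HZ + 99) / 100 jiffies, rounds up to roughly 10ms
+ * regardless of the configured HZ, so connection statistics are pushed to
+ * the registered sync callback frequently without a fixed dependency on
+ * the tick rate. The two branches reflect the kernel's move from
+ * setup_timer() to timer_setup() around v4.15.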
+ */ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 15, 0)) + setup_timer(&si->timer, sfe_ipv6_periodic_sync, (unsigned long)si); +#else + timer_setup(&si->timer, sfe_ipv6_periodic_sync, 0); +#endif + mod_timer(&si->timer, jiffies + ((HZ + 99) / 100)); + + spin_lock_init(&si->lock); + + return 0; + +exit4: +#ifdef CONFIG_NF_FLOW_COOKIE + sysfs_remove_file(si->sys_sfe_ipv6, &sfe_ipv6_flow_cookie_attr.attr); + +exit3: +#endif /* CONFIG_NF_FLOW_COOKIE */ + sysfs_remove_file(si->sys_sfe_ipv6, &sfe_ipv6_debug_dev_attr.attr); + +exit2: + kobject_put(si->sys_sfe_ipv6); + +exit1: + return result; +} + +/* + * sfe_ipv6_exit() + */ +static void __exit sfe_ipv6_exit(void) +{ + struct sfe_ipv6 *si = &__si6; + + DEBUG_INFO("SFE IPv6 exit\n"); + + /* + * Destroy all connections. + */ + sfe_ipv6_destroy_all_rules_for_dev(NULL); + + del_timer_sync(&si->timer); + + unregister_chrdev(si->debug_dev, "sfe_ipv6"); + +#ifdef CONFIG_NF_FLOW_COOKIE + sysfs_remove_file(si->sys_sfe_ipv6, &sfe_ipv6_flow_cookie_attr.attr); +#endif /* CONFIG_NF_FLOW_COOKIE */ + sysfs_remove_file(si->sys_sfe_ipv6, &sfe_ipv6_debug_dev_attr.attr); + + kobject_put(si->sys_sfe_ipv6); +} + +module_init(sfe_ipv6_init) +module_exit(sfe_ipv6_exit) + +EXPORT_SYMBOL(sfe_ipv6_recv); +EXPORT_SYMBOL(sfe_ipv6_create_rule); +EXPORT_SYMBOL(sfe_ipv6_destroy_rule); +EXPORT_SYMBOL(sfe_ipv6_destroy_all_rules_for_dev); +EXPORT_SYMBOL(sfe_ipv6_register_sync_rule_callback); +EXPORT_SYMBOL(sfe_ipv6_mark_rule); +EXPORT_SYMBOL(sfe_ipv6_update_rule); +#ifdef CONFIG_NF_FLOW_COOKIE +EXPORT_SYMBOL(sfe_ipv6_register_flow_cookie_cb); +EXPORT_SYMBOL(sfe_ipv6_unregister_flow_cookie_cb); +#endif + +MODULE_DESCRIPTION("Shortcut Forwarding Engine - IPv6 support"); +MODULE_LICENSE("Dual BSD/GPL"); + diff --git a/shortcut-fe/shortcut-fe/Kconfig b/shortcut-fe/shortcut-fe/Kconfig new file mode 100644 index 000000000..487f1e065 --- /dev/null +++ b/shortcut-fe/shortcut-fe/Kconfig @@ -0,0 +1,14 @@ +# +# Shortcut forwarding engine +# + +config SHORTCUT_FE + tristate "Shortcut Forwarding Engine" + depends on NF_CONNTRACK + ---help--- + Shortcut is a fast in-kernel packet forwarding engine. + + To compile this code as a module, choose M here: the module will be + called shortcut-fe. + + If unsure, say N. diff --git a/shortcut-fe/shortcut-fe/sfe.h b/shortcut-fe/shortcut-fe/sfe.h new file mode 100644 index 000000000..279e7b3dc --- /dev/null +++ b/shortcut-fe/shortcut-fe/sfe.h @@ -0,0 +1,114 @@ +/* + * sfe.h + * Shortcut forwarding engine. + * + * Copyright (c) 2013-2017 The Linux Foundation. All rights reserved. + * Permission to use, copy, modify, and/or distribute this software for + * any purpose with or without fee is hereby granted, provided that the + * above copyright notice and this permission notice appear in all copies. + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT + * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + + +/* + * The following are debug macros used throughout the SFE. + * + * The DEBUG_LEVEL enables the followings based on its value, + * when dynamic debug option is disabled. 
+ * + * 0 = OFF + * 1 = ASSERTS / ERRORS + * 2 = 1 + WARN + * 3 = 2 + INFO + * 4 = 3 + TRACE + */ +#define DEBUG_LEVEL 2 + +#if (DEBUG_LEVEL < 1) +#define DEBUG_ASSERT(s, ...) +#define DEBUG_ERROR(s, ...) +#else +#define DEBUG_ASSERT(c, s, ...) if (!(c)) { pr_emerg("ASSERT: %s:%d:" s, __FUNCTION__, __LINE__, ##__VA_ARGS__); BUG(); } +#define DEBUG_ERROR(s, ...) pr_err("%s:%d:" s, __FUNCTION__, __LINE__, ##__VA_ARGS__) +#endif + +#if defined(CONFIG_DYNAMIC_DEBUG) +/* + * Compile messages for dynamic enable/disable + */ +#define DEBUG_WARN(s, ...) pr_debug("%s[%d]:" s, __FUNCTION__, __LINE__, ##__VA_ARGS__) +#define DEBUG_INFO(s, ...) pr_debug("%s[%d]:" s, __FUNCTION__, __LINE__, ##__VA_ARGS__) +#define DEBUG_TRACE(s, ...) pr_debug("%s[%d]:" s, __FUNCTION__, __LINE__, ##__VA_ARGS__) +#else + +/* + * Statically compile messages at different levels + */ +#if (DEBUG_LEVEL < 2) +#define DEBUG_WARN(s, ...) +#else +#define DEBUG_WARN(s, ...) pr_warn("%s[%d]:" s, __FUNCTION__, __LINE__, ##__VA_ARGS__) +#endif + +#if (DEBUG_LEVEL < 3) +#define DEBUG_INFO(s, ...) +#else +#define DEBUG_INFO(s, ...) pr_notice("%s[%d]:" s, __FUNCTION__, __LINE__, ##__VA_ARGS__) +#endif + +#if (DEBUG_LEVEL < 4) +#define DEBUG_TRACE(s, ...) +#else +#define DEBUG_TRACE(s, ...) pr_info("%s[%d]:" s, __FUNCTION__, __LINE__, ##__VA_ARGS__) +#endif +#endif + +#ifdef CONFIG_NF_FLOW_COOKIE +typedef int (*flow_cookie_set_func_t)(u32 protocol, __be32 src_ip, __be16 src_port, + __be32 dst_ip, __be16 dst_port, u16 flow_cookie); +/* + * sfe_register_flow_cookie_cb + * register a function in SFE to let SFE use this function to configure flow cookie for a flow + * + * Hardware driver which support flow cookie should register a callback function in SFE. Then SFE + * can use this function to configure flow cookie for a flow. + * return: 0, success; !=0, fail + */ +int sfe_register_flow_cookie_cb(flow_cookie_set_func_t cb); + +/* + * sfe_unregister_flow_cookie_cb + * unregister function which is used to configure flow cookie for a flow + * + * return: 0, success; !=0, fail + */ +int sfe_unregister_flow_cookie_cb(flow_cookie_set_func_t cb); + +typedef int (*sfe_ipv6_flow_cookie_set_func_t)(u32 protocol, __be32 src_ip[4], __be16 src_port, + __be32 dst_ip[4], __be16 dst_port, u16 flow_cookie); + +/* + * sfe_ipv6_register_flow_cookie_cb + * register a function in SFE to let SFE use this function to configure flow cookie for a flow + * + * Hardware driver which support flow cookie should register a callback function in SFE. Then SFE + * can use this function to configure flow cookie for a flow. + * return: 0, success; !=0, fail + */ +int sfe_ipv6_register_flow_cookie_cb(sfe_ipv6_flow_cookie_set_func_t cb); + +/* + * sfe_ipv6_unregister_flow_cookie_cb + * unregister function which is used to configure flow cookie for a flow + * + * return: 0, success; !=0, fail + */ +int sfe_ipv6_unregister_flow_cookie_cb(sfe_ipv6_flow_cookie_set_func_t cb); + +#endif /*CONFIG_NF_FLOW_COOKIE*/ diff --git a/shortcut-fe/shortcut-fe/sfe_backport.h b/shortcut-fe/shortcut-fe/sfe_backport.h new file mode 100644 index 000000000..d2d60c73c --- /dev/null +++ b/shortcut-fe/shortcut-fe/sfe_backport.h @@ -0,0 +1,195 @@ +/* + * sfe_backport.h + * Shortcut forwarding engine compatible header file. + * + * Copyright (c) 2014-2016 The Linux Foundation. All rights reserved. 
+ * Permission to use, copy, modify, and/or distribute this software for + * any purpose with or without fee is hereby granted, provided that the + * above copyright notice and this permission notice appear in all copies. + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT + * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3, 4, 0)) +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3, 7, 0)) +#include +#else +enum udp_conntrack { + UDP_CT_UNREPLIED, + UDP_CT_REPLIED, + UDP_CT_MAX +}; + +static inline unsigned int * +nf_ct_timeout_lookup(struct net *net, struct nf_conn *ct, + struct nf_conntrack_l4proto *l4proto) +{ +#ifdef CONFIG_NF_CONNTRACK_TIMEOUT + struct nf_conn_timeout *timeout_ext; + unsigned int *timeouts; + + timeout_ext = nf_ct_timeout_find(ct); + if (timeout_ext) + timeouts = NF_CT_TIMEOUT_EXT_DATA(timeout_ext); + else + timeouts = l4proto->get_timeouts(net); + + return timeouts; +#else + return l4proto->get_timeouts(net); +#endif /*CONFIG_NF_CONNTRACK_TIMEOUT*/ +} +#endif /*KERNEL_VERSION(3, 7, 0)*/ +#endif /*KERNEL_VERSION(3, 4, 0)*/ + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 4, 0)) +#define sfe_define_post_routing_hook(FN_NAME, HOOKNUM, OPS, SKB, UNUSED, OUT, OKFN) \ +static unsigned int FN_NAME(void *priv, \ + struct sk_buff *SKB, \ + const struct nf_hook_state *state) +#elif (LINUX_VERSION_CODE >= KERNEL_VERSION(3, 13, 0)) +#define sfe_define_post_routing_hook(FN_NAME, HOOKNUM, OPS, SKB, UNUSED, OUT, OKFN) \ +static unsigned int FN_NAME(const struct nf_hook_ops *OPS, \ + struct sk_buff *SKB, \ + const struct net_device *UNUSED, \ + const struct net_device *OUT, \ + int (*OKFN)(struct sk_buff *)) +#else +#define sfe_define_post_routing_hook(FN_NAME, HOOKNUM, OPS, SKB, UNUSED, OUT, OKFN) \ +static unsigned int FN_NAME(unsigned int HOOKNUM, \ + struct sk_buff *SKB, \ + const struct net_device *UNUSED, \ + const struct net_device *OUT, \ + int (*OKFN)(struct sk_buff *)) +#endif + +#define sfe_cm_ipv4_post_routing_hook(HOOKNUM, OPS, SKB, UNUSED, OUT, OKFN) \ + sfe_define_post_routing_hook(__sfe_cm_ipv4_post_routing_hook, HOOKNUM, OPS, SKB, UNUSED, OUT, OKFN) +#define sfe_cm_ipv6_post_routing_hook(HOOKNUM, OPS, SKB, UNUSED, OUT, OKFN) \ + sfe_define_post_routing_hook(__sfe_cm_ipv6_post_routing_hook, HOOKNUM, OPS, SKB, UNUSED, OUT, OKFN) +#define fast_classifier_ipv4_post_routing_hook(HOOKNUM, OPS, SKB, UNUSED, OUT, OKFN) \ + sfe_define_post_routing_hook(__fast_classifier_ipv4_post_routing_hook, HOOKNUM, OPS, SKB, UNUSED, OUT, OKFN) +#define fast_classifier_ipv6_post_routing_hook(HOOKNUM, OPS, SKB, UNUSED, OUT, OKFN) \ + sfe_define_post_routing_hook(__fast_classifier_ipv6_post_routing_hook, HOOKNUM, OPS, SKB, UNUSED, OUT, OKFN) + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 4, 0)) +#define SFE_IPV4_NF_POST_ROUTING_HOOK(fn) \ + { \ + .hook = fn, \ + .pf = NFPROTO_IPV4, \ + .hooknum = NF_INET_POST_ROUTING, \ + .priority = NF_IP_PRI_NAT_SRC + 1, \ + } +#else +#define SFE_IPV4_NF_POST_ROUTING_HOOK(fn) \ + { \ + .hook = fn, \ + .owner = THIS_MODULE, \ + .pf = NFPROTO_IPV4, \ + .hooknum = NF_INET_POST_ROUTING, \ + .priority 
= NF_IP_PRI_NAT_SRC + 1, \ + } +#endif + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 4, 0)) +#define SFE_IPV6_NF_POST_ROUTING_HOOK(fn) \ + { \ + .hook = fn, \ + .pf = NFPROTO_IPV6, \ + .hooknum = NF_INET_POST_ROUTING, \ + .priority = NF_IP_PRI_NAT_SRC + 1, \ + } +#else +#define SFE_IPV6_NF_POST_ROUTING_HOOK(fn) \ + { \ + .hook = fn, \ + .owner = THIS_MODULE, \ + .pf = NFPROTO_IPV6, \ + .hooknum = NF_INET_POST_ROUTING, \ + .priority = NF_IP6_PRI_NAT_SRC + 1, \ + } +#endif + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0)) +#define SFE_NF_CT_DEFAULT_ZONE (&nf_ct_zone_dflt) +#else +#define SFE_NF_CT_DEFAULT_ZONE NF_CT_DEFAULT_ZONE +#endif + +/* + * sfe_dev_get_master + * get master of bridge port, and hold it + */ +static inline struct net_device *sfe_dev_get_master(struct net_device *dev) +{ + struct net_device *master; +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3, 9, 0)) + rcu_read_lock(); + master = netdev_master_upper_dev_get_rcu(dev); + if (master) + dev_hold(master); + + rcu_read_unlock(); +#else + master = dev->master; + if (master) + dev_hold(master); +#endif + return master; +} + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3, 11, 0)) +#define SFE_DEV_EVENT_PTR(PTR) netdev_notifier_info_to_dev(PTR) +#else +#define SFE_DEV_EVENT_PTR(PTR) (struct net_device *)(PTR) +#endif + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3, 13, 0)) +#define SFE_NF_CONN_ACCT(NM) struct nf_conn_acct *NM +#else +#define SFE_NF_CONN_ACCT(NM) struct nf_conn_counter *NM +#endif + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3, 13, 0)) +#define SFE_ACCT_COUNTER(NM) ((NM)->counter) +#else +#define SFE_ACCT_COUNTER(NM) (NM) +#endif + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3, 9, 0)) +#define sfe_hash_for_each_possible(name, obj, node, member, key) \ + hash_for_each_possible(name, obj, member, key) +#else +#define sfe_hash_for_each_possible(name, obj, node, member, key) \ + hash_for_each_possible(name, obj, node, member, key) +#endif + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3, 9, 0)) +#define sfe_hash_for_each(name, bkt, node, obj, member) \ + hash_for_each(name, bkt, obj, member) +#else +#define sfe_hash_for_each(name, bkt, node, obj, member) \ + hash_for_each(name, bkt, node, obj, member) +#endif + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3, 4, 0)) +#define sfe_dst_get_neighbour(dst, daddr) dst_neigh_lookup(dst, addr) +#else +static inline struct neighbour * +sfe_dst_get_neighbour(struct dst_entry *dst, void *daddr) +{ + struct neighbour *neigh = dst_get_neighbour_noref(dst); + + if (neigh) + neigh_hold(neigh); + + return neigh; +} +#endif diff --git a/shortcut-fe/shortcut-fe/sfe_cm.c b/shortcut-fe/shortcut-fe/sfe_cm.c new file mode 100644 index 000000000..bd1bb88aa --- /dev/null +++ b/shortcut-fe/shortcut-fe/sfe_cm.c @@ -0,0 +1,1154 @@ +/* + * sfe-cm.c + * Shortcut forwarding engine connection manager. + * + * Copyright (c) 2013-2018, 2020 The Linux Foundation. All rights reserved. + * Permission to use, copy, modify, and/or distribute this software for + * any purpose with or without fee is hereby granted, provided that the + * above copyright notice and this permission notice appear in all copies. + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT + * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sfe.h" +#include "sfe_cm.h" +#include "sfe_backport.h" + +typedef enum sfe_cm_exception { + SFE_CM_EXCEPTION_PACKET_BROADCAST, + SFE_CM_EXCEPTION_PACKET_MULTICAST, + SFE_CM_EXCEPTION_NO_IIF, + SFE_CM_EXCEPTION_NO_CT, + SFE_CM_EXCEPTION_CT_NO_TRACK, + SFE_CM_EXCEPTION_CT_NO_CONFIRM, + SFE_CM_EXCEPTION_CT_IS_ALG, + SFE_CM_EXCEPTION_IS_IPV4_MCAST, + SFE_CM_EXCEPTION_IS_IPV6_MCAST, + SFE_CM_EXCEPTION_TCP_NOT_ASSURED, + SFE_CM_EXCEPTION_TCP_NOT_ESTABLISHED, + SFE_CM_EXCEPTION_UNKNOW_PROTOCOL, + SFE_CM_EXCEPTION_NO_SRC_DEV, + SFE_CM_EXCEPTION_NO_SRC_XLATE_DEV, + SFE_CM_EXCEPTION_NO_DEST_DEV, + SFE_CM_EXCEPTION_NO_DEST_XLATE_DEV, + SFE_CM_EXCEPTION_NO_BRIDGE, + SFE_CM_EXCEPTION_LOCAL_OUT, + SFE_CM_EXCEPTION_MAX +} sfe_cm_exception_t; + +static char *sfe_cm_exception_events_string[SFE_CM_EXCEPTION_MAX] = { + "PACKET_BROADCAST", + "PACKET_MULTICAST", + "NO_IIF", + "NO_CT", + "CT_NO_TRACK", + "CT_NO_CONFIRM", + "CT_IS_ALG", + "IS_IPV4_MCAST", + "IS_IPV6_MCAST", + "TCP_NOT_ASSURED", + "TCP_NOT_ESTABLISHED", + "UNKNOW_PROTOCOL", + "NO_SRC_DEV", + "NO_SRC_XLATE_DEV", + "NO_DEST_DEV", + "NO_DEST_XLATE_DEV", + "NO_BRIDGE", + "LOCAL_OUT" +}; + +/* + * Per-module structure. + */ +struct sfe_cm { + spinlock_t lock; /* Lock for SMP correctness */ + + /* + * Control state. + */ + struct kobject *sys_sfe_cm; /* sysfs linkage */ + + /* + * Callback notifiers. + */ + struct notifier_block dev_notifier; /* Device notifier */ + struct notifier_block inet_notifier; /* IPv4 notifier */ + struct notifier_block inet6_notifier; /* IPv6 notifier */ + u32 exceptions[SFE_CM_EXCEPTION_MAX]; +}; + +static struct sfe_cm __sc; + +/* + * sfe_cm_incr_exceptions() + * increase an exception counter. + */ +static inline void sfe_cm_incr_exceptions(sfe_cm_exception_t except) +{ + struct sfe_cm *sc = &__sc; + + spin_lock_bh(&sc->lock); + sc->exceptions[except]++; + spin_unlock_bh(&sc->lock); +} + +/* + * sfe_cm_recv() + * Handle packet receives. + * + * Returns 1 if the packet is forwarded or 0 if it isn't. + */ +int sfe_cm_recv(struct sk_buff *skb) +{ + struct net_device *dev; + + /* + * We know that for the vast majority of packets we need the transport + * layer header so we may as well start to fetch it now! + */ + prefetch(skb->data + 32); + barrier(); + + dev = skb->dev; + + /* + * We're only interested in IPv4 and IPv6 packets. + */ + if (likely(htons(ETH_P_IP) == skb->protocol)) { + struct in_device *in_dev; + + /* + * Does our input device support IP processing? + */ + in_dev = (struct in_device *)dev->ip_ptr; + if (unlikely(!in_dev)) { + DEBUG_TRACE("no IP processing for device: %s\n", dev->name); + return 0; + } + + /* + * Does it have an IP address? If it doesn't then we can't do anything + * interesting here! + */ + if (unlikely(!in_dev->ifa_list)) { + DEBUG_TRACE("no IP address for device: %s\n", dev->name); + return 0; + } + + return sfe_ipv4_recv(dev, skb); + } + + if (likely(htons(ETH_P_IPV6) == skb->protocol)) { + struct inet6_dev *in_dev; + + /* + * Does our input device support IPv6 processing? 
+ */ + in_dev = (struct inet6_dev *)dev->ip6_ptr; + if (unlikely(!in_dev)) { + DEBUG_TRACE("no IPv6 processing for device: %s\n", dev->name); + return 0; + } + + /* + * Does it have an IPv6 address? If it doesn't then we can't do anything + * interesting here! + */ + if (unlikely(list_empty(&in_dev->addr_list))) { + DEBUG_TRACE("no IPv6 address for device: %s\n", dev->name); + return 0; + } + + return sfe_ipv6_recv(dev, skb); + } + + DEBUG_TRACE("not IP packet\n"); + return 0; +} + +/* + * sfe_cm_find_dev_and_mac_addr() + * Find the device and MAC address for a given IPv4/IPv6 address. + * + * Returns true if we find the device and MAC address, otherwise false. + * + * We look up the rtable entry for the address and, from its neighbour + * structure, obtain the hardware address. This means this function also + * works if the neighbours are routers too. + */ +static bool sfe_cm_find_dev_and_mac_addr(sfe_ip_addr_t *addr, struct net_device **dev, u8 *mac_addr, int is_v4) +{ + struct neighbour *neigh; + struct rtable *rt; + struct rt6_info *rt6; + struct dst_entry *dst; + struct net_device *mac_dev; + + /* + * Look up the rtable entry for the IP address then get the hardware + * address from its neighbour structure. This means this work when the + * neighbours are routers too. + */ + if (likely(is_v4)) { + rt = ip_route_output(&init_net, addr->ip, 0, 0, 0); + if (unlikely(IS_ERR(rt))) { + goto ret_fail; + } + + dst = (struct dst_entry *)rt; + } else { +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 17, 0)) + rt6 = rt6_lookup(&init_net, (struct in6_addr *)addr->ip6, 0, 0, 0); +#else + rt6 = rt6_lookup(&init_net, (struct in6_addr *)addr->ip6, 0, 0, NULL, 0); +#endif + if (!rt6) { + goto ret_fail; + } + + dst = (struct dst_entry *)rt6; + } + + rcu_read_lock(); + neigh = sfe_dst_get_neighbour(dst, addr); + if (unlikely(!neigh)) { + rcu_read_unlock(); + dst_release(dst); + goto ret_fail; + } + + if (unlikely(!(neigh->nud_state & NUD_VALID))) { + rcu_read_unlock(); + neigh_release(neigh); + dst_release(dst); + goto ret_fail; + } + + mac_dev = neigh->dev; + if (!mac_dev) { + rcu_read_unlock(); + neigh_release(neigh); + dst_release(dst); + goto ret_fail; + } + + memcpy(mac_addr, neigh->ha, (size_t)mac_dev->addr_len); + + dev_hold(mac_dev); + *dev = mac_dev; + rcu_read_unlock(); + neigh_release(neigh); + dst_release(dst); + + return true; + +ret_fail: + if (is_v4) { + DEBUG_TRACE("failed to find MAC address for IP: %pI4\n", &addr->ip); + + } else { + DEBUG_TRACE("failed to find MAC address for IP: %pI6\n", addr->ip6); + } + + return false; +} + +/* + * sfe_cm_post_routing() + * Called for packets about to leave the box - either locally generated or forwarded from another interface + */ +static unsigned int sfe_cm_post_routing(struct sk_buff *skb, int is_v4) +{ + struct sfe_connection_create sic; + struct net_device *in; + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + struct net_device *dev; + struct net_device *src_dev; + struct net_device *dest_dev; + struct net_device *src_dev_tmp; + struct net_device *dest_dev_tmp; + struct net_device *src_br_dev = NULL; + struct net_device *dest_br_dev = NULL; + struct nf_conntrack_tuple orig_tuple; + struct nf_conntrack_tuple reply_tuple; + SFE_NF_CONN_ACCT(acct); + + /* + * Don't process broadcast or multicast packets. 
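+ * This is the first of a series of early-exit checks: any flow that SFE
+ * cannot accelerate returns NF_ACCEPT (usually after bumping an exception
+ * counter) so the packet keeps using the normal slow path, and only
+ * connections that survive every check are handed to
+ * sfe_ipv4_create_rule() / sfe_ipv6_create_rule() at the bottom.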
+ */ + if (unlikely(skb->pkt_type == PACKET_BROADCAST)) { + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_PACKET_BROADCAST); + DEBUG_TRACE("broadcast, ignoring\n"); + return NF_ACCEPT; + } + if (unlikely(skb->pkt_type == PACKET_MULTICAST)) { + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_PACKET_MULTICAST); + DEBUG_TRACE("multicast, ignoring\n"); + return NF_ACCEPT; + } + +#ifdef CONFIG_XFRM + /* + * Packet to xfrm for encapsulation, we can't process it + */ + if (unlikely(skb_dst(skb)->xfrm)) { + DEBUG_TRACE("packet to xfrm, ignoring\n"); + return NF_ACCEPT; + } +#endif + + /* + * Don't process locally generated packets. + */ + if (skb->sk) { + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_LOCAL_OUT); + DEBUG_TRACE("skip local out packet\n"); + return NF_ACCEPT; + } + + /* + * Don't process packets that are not being forwarded. + */ + in = dev_get_by_index(&init_net, skb->skb_iif); + if (!in) { + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_IIF); + DEBUG_TRACE("packet not forwarding\n"); + return NF_ACCEPT; + } + + dev_put(in); + + /* + * Don't process packets that aren't being tracked by conntrack. + */ + ct = nf_ct_get(skb, &ctinfo); + if (unlikely(!ct)) { + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_CT); + DEBUG_TRACE("no conntrack connection, ignoring\n"); + return NF_ACCEPT; + } + + /* + * Don't process untracked connections. + */ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0)) + if (unlikely(nf_ct_is_untracked(ct))) { +#else + if (unlikely(ctinfo == IP_CT_UNTRACKED)) { +#endif + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_CT_NO_TRACK); + DEBUG_TRACE("untracked connection\n"); + return NF_ACCEPT; + } + + /* + * Unconfirmed connection may be dropped by Linux at the final step, + * So we don't process unconfirmed connections. + */ + if (!nf_ct_is_confirmed(ct)) { + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_CT_NO_CONFIRM); + DEBUG_TRACE("unconfirmed connection\n"); + return NF_ACCEPT; + } + + /* + * Don't process connections that require support from a 'helper' (typically a NAT ALG). + */ + if (unlikely(nfct_help(ct))) { + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_CT_IS_ALG); + DEBUG_TRACE("connection has helper\n"); + return NF_ACCEPT; + } + + /* + * Check if the acceleration of a flow could be rejected quickly. + */ + acct = nf_conn_acct_find(ct); + if (acct) { + long long packets = atomic64_read(&SFE_ACCT_COUNTER(acct)[CTINFO2DIR(ctinfo)].packets); + if ((packets > 0xff) && (packets & 0xff)) { + /* + * Connection hits slow path at least 256 times, so it must be not able to accelerate. + * But we also give it a chance to walk through ECM every 256 packets + */ + return NF_ACCEPT; + } + } + + /* + * Look up the details of our connection in conntrack. + * + * Note that the data we get from conntrack is for the "ORIGINAL" direction + * but our packet may actually be in the "REPLY" direction. 
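+ * That is also why the translated ("xlate") addresses and ports below are
+ * taken from the reply tuple with source and destination swapped: the
+ * reply tuple already describes the connection in post-NAT terms.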
+ */ + orig_tuple = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; + reply_tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple; + sic.protocol = (s32)orig_tuple.dst.protonum; + + sic.flags = 0; + + /* + * Get addressing information, non-NAT first + */ + if (likely(is_v4)) { + u32 dscp; + + sic.src_ip.ip = (__be32)orig_tuple.src.u3.ip; + sic.dest_ip.ip = (__be32)orig_tuple.dst.u3.ip; + + if (ipv4_is_multicast(sic.src_ip.ip) || ipv4_is_multicast(sic.dest_ip.ip)) { + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_IS_IPV4_MCAST); + DEBUG_TRACE("multicast address\n"); + return NF_ACCEPT; + } + + /* + * NAT'ed addresses - note these are as seen from the 'reply' direction + * When NAT does not apply to this connection these will be identical to the above. + */ + sic.src_ip_xlate.ip = (__be32)reply_tuple.dst.u3.ip; + sic.dest_ip_xlate.ip = (__be32)reply_tuple.src.u3.ip; + + dscp = ipv4_get_dsfield(ip_hdr(skb)) >> XT_DSCP_SHIFT; + if (dscp) { + sic.dest_dscp = dscp; + sic.src_dscp = sic.dest_dscp; + sic.flags |= SFE_CREATE_FLAG_REMARK_DSCP; + } + } else { + u32 dscp; + + sic.src_ip.ip6[0] = *((struct sfe_ipv6_addr *)&orig_tuple.src.u3.in6); + sic.dest_ip.ip6[0] = *((struct sfe_ipv6_addr *)&orig_tuple.dst.u3.in6); + + if (ipv6_addr_is_multicast((struct in6_addr *)sic.src_ip.ip6) || + ipv6_addr_is_multicast((struct in6_addr *)sic.dest_ip.ip6)) { + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_IS_IPV6_MCAST); + DEBUG_TRACE("multicast address\n"); + return NF_ACCEPT; + } + + /* + * NAT'ed addresses - note these are as seen from the 'reply' direction + * When NAT does not apply to this connection these will be identical to the above. + */ + sic.src_ip_xlate.ip6[0] = *((struct sfe_ipv6_addr *)&reply_tuple.dst.u3.in6); + sic.dest_ip_xlate.ip6[0] = *((struct sfe_ipv6_addr *)&reply_tuple.src.u3.in6); + + dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> XT_DSCP_SHIFT; + if (dscp) { + sic.dest_dscp = dscp; + sic.src_dscp = sic.dest_dscp; + sic.flags |= SFE_CREATE_FLAG_REMARK_DSCP; + } + } + + switch (sic.protocol) { + case IPPROTO_TCP: + sic.src_port = orig_tuple.src.u.tcp.port; + sic.dest_port = orig_tuple.dst.u.tcp.port; + sic.src_port_xlate = reply_tuple.dst.u.tcp.port; + sic.dest_port_xlate = reply_tuple.src.u.tcp.port; + sic.src_td_window_scale = ct->proto.tcp.seen[0].td_scale; + sic.src_td_max_window = ct->proto.tcp.seen[0].td_maxwin; + sic.src_td_end = ct->proto.tcp.seen[0].td_end; + sic.src_td_max_end = ct->proto.tcp.seen[0].td_maxend; + sic.dest_td_window_scale = ct->proto.tcp.seen[1].td_scale; + sic.dest_td_max_window = ct->proto.tcp.seen[1].td_maxwin; + sic.dest_td_end = ct->proto.tcp.seen[1].td_end; + sic.dest_td_max_end = ct->proto.tcp.seen[1].td_maxend; + + if (nf_ct_tcp_no_window_check + || (ct->proto.tcp.seen[0].flags & IP_CT_TCP_FLAG_BE_LIBERAL) + || (ct->proto.tcp.seen[1].flags & IP_CT_TCP_FLAG_BE_LIBERAL)) { + sic.flags |= SFE_CREATE_FLAG_NO_SEQ_CHECK; + } + + /* + * Don't try to manage a non-established connection. + */ + if (!test_bit(IPS_ASSURED_BIT, &ct->status)) { + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_TCP_NOT_ASSURED); + DEBUG_TRACE("non-established connection\n"); + return NF_ACCEPT; + } + + /* + * If the connection is shutting down do not manage it. + * state can not be SYN_SENT, SYN_RECV because connection is assured + * Not managed states: FIN_WAIT, CLOSE_WAIT, LAST_ACK, TIME_WAIT, CLOSE. 
+ */ + spin_lock_bh(&ct->lock); + if (ct->proto.tcp.state != TCP_CONNTRACK_ESTABLISHED) { + spin_unlock_bh(&ct->lock); + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_TCP_NOT_ESTABLISHED); + DEBUG_TRACE("connection in termination state: %#x, s: %pI4:%u, d: %pI4:%u\n", + ct->proto.tcp.state, &sic.src_ip, ntohs(sic.src_port), + &sic.dest_ip, ntohs(sic.dest_port)); + return NF_ACCEPT; + } + spin_unlock_bh(&ct->lock); + break; + + case IPPROTO_UDP: + sic.src_port = orig_tuple.src.u.udp.port; + sic.dest_port = orig_tuple.dst.u.udp.port; + sic.src_port_xlate = reply_tuple.dst.u.udp.port; + sic.dest_port_xlate = reply_tuple.src.u.udp.port; + break; + + default: + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_UNKNOW_PROTOCOL); + DEBUG_TRACE("unhandled protocol %d\n", sic.protocol); + return NF_ACCEPT; + } + +#ifdef CONFIG_XFRM + sic.original_accel = 1; + sic.reply_accel = 1; + + /* + * For packets de-capsulated from xfrm, we still can accelerate it + * on the direction we just received the packet. + */ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(5, 0, 0)) + if (unlikely(skb->sp)) { +#else + if (unlikely(secpath_exists(skb))) { +#endif + if (sic.protocol == IPPROTO_TCP && + !(sic.flags & SFE_CREATE_FLAG_NO_SEQ_CHECK)) { + return NF_ACCEPT; + } + + if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) { + sic.reply_accel = 0; + } else { + sic.original_accel = 0; + } + } +#endif + + /* + * Get QoS information + */ + if (skb->priority) { + sic.dest_priority = skb->priority; + sic.src_priority = sic.dest_priority; + sic.flags |= SFE_CREATE_FLAG_REMARK_PRIORITY; + } + + /* + * Get the net device and MAC addresses that correspond to the various source and + * destination host addresses. + */ + if (!sfe_cm_find_dev_and_mac_addr(&sic.src_ip, &src_dev_tmp, sic.src_mac, is_v4)) { + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_SRC_DEV); + return NF_ACCEPT; + } + src_dev = src_dev_tmp; + + if (!sfe_cm_find_dev_and_mac_addr(&sic.src_ip_xlate, &dev, sic.src_mac_xlate, is_v4)) { + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_SRC_XLATE_DEV); + goto done1; + } + dev_put(dev); + + if (!sfe_cm_find_dev_and_mac_addr(&sic.dest_ip, &dev, sic.dest_mac, is_v4)) { + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_DEST_DEV); + goto done1; + } + dev_put(dev); + + if (!sfe_cm_find_dev_and_mac_addr(&sic.dest_ip_xlate, &dest_dev_tmp, sic.dest_mac_xlate, is_v4)) { + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_DEST_XLATE_DEV); + goto done1; + } + dest_dev = dest_dev_tmp; + + /* + * Our devices may actually be part of a bridge interface. If that's + * the case then find the bridge interface instead. + */ + if (src_dev->priv_flags & IFF_BRIDGE_PORT) { + src_br_dev = sfe_dev_get_master(src_dev); + if (!src_br_dev) { + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_BRIDGE); + DEBUG_TRACE("no bridge found for: %s\n", src_dev->name); + goto done2; + } + src_dev = src_br_dev; + } + + if (dest_dev->priv_flags & IFF_BRIDGE_PORT) { + dest_br_dev = sfe_dev_get_master(dest_dev); + if (!dest_br_dev) { + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_BRIDGE); + DEBUG_TRACE("no bridge found for: %s\n", dest_dev->name); + goto done3; + } + dest_dev = dest_br_dev; + } + + sic.src_dev = src_dev; + sic.dest_dev = dest_dev; + + sic.src_mtu = src_dev->mtu; + sic.dest_mtu = dest_dev->mtu; + + if (likely(is_v4)) { + sfe_ipv4_create_rule(&sic); + } else { + sfe_ipv6_create_rule(&sic); + } + + /* + * If we had bridge ports then release them too. 
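+ * src_dev/dest_dev may have been redirected to the bridge masters held by
+ * sfe_dev_get_master() while src_dev_tmp/dest_dev_tmp still hold the
+ * underlying ports, which is why the unwind below drops the references in
+ * two stages.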
+ */ + if (dest_br_dev) { + dev_put(dest_br_dev); + } +done3: + if (src_br_dev) { + dev_put(src_br_dev); + } +done2: + dev_put(dest_dev_tmp); +done1: + dev_put(src_dev_tmp); + + return NF_ACCEPT; +} + +/* + * sfe_cm_ipv4_post_routing_hook() + * Called for packets about to leave the box - either locally generated or forwarded from another interface + */ +sfe_cm_ipv4_post_routing_hook(hooknum, ops, skb, in_unused, out, okfn) +{ + return sfe_cm_post_routing(skb, true); +} + +/* + * sfe_cm_ipv6_post_routing_hook() + * Called for packets about to leave the box - either locally generated or forwarded from another interface + */ +sfe_cm_ipv6_post_routing_hook(hooknum, ops, skb, in_unused, out, okfn) +{ + return sfe_cm_post_routing(skb, false); +} + +#ifdef CONFIG_NF_CONNTRACK_EVENTS +/* + * sfe_cm_conntrack_event() + * Callback event invoked when a conntrack connection's state changes. + */ +#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS +static int sfe_cm_conntrack_event(struct notifier_block *this, + unsigned long events, void *ptr) +#else +static int sfe_cm_conntrack_event(unsigned int events, struct nf_ct_event *item) +#endif +{ +#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS + struct nf_ct_event *item = ptr; +#endif + struct sfe_connection_destroy sid; + struct nf_conn *ct = item->ct; + struct nf_conntrack_tuple orig_tuple; + + /* + * If we don't have a conntrack entry then we're done. + */ + if (unlikely(!ct)) { + DEBUG_WARN("no ct in conntrack event callback\n"); + return NOTIFY_DONE; + } + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0)) + if (unlikely(nf_ct_is_untracked(ct))) { + DEBUG_TRACE("ignoring untracked conn\n"); + return NOTIFY_DONE; + } +#endif + + /* + * We're only interested in destroy events. + */ + if (unlikely(!(events & (1 << IPCT_DESTROY)))) { + DEBUG_TRACE("ignoring non-destroy event\n"); + return NOTIFY_DONE; + } + + orig_tuple = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; + sid.protocol = (s32)orig_tuple.dst.protonum; + + /* + * Extract information from the conntrack connection. We're only interested + * in nominal connection information (i.e. we're ignoring any NAT information). + */ + switch (sid.protocol) { + case IPPROTO_TCP: + sid.src_port = orig_tuple.src.u.tcp.port; + sid.dest_port = orig_tuple.dst.u.tcp.port; + break; + + case IPPROTO_UDP: + sid.src_port = orig_tuple.src.u.udp.port; + sid.dest_port = orig_tuple.dst.u.udp.port; + break; + + default: + DEBUG_TRACE("unhandled protocol: %d\n", sid.protocol); + return NOTIFY_DONE; + } + + if (likely(nf_ct_l3num(ct) == AF_INET)) { + sid.src_ip.ip = (__be32)orig_tuple.src.u3.ip; + sid.dest_ip.ip = (__be32)orig_tuple.dst.u3.ip; + + sfe_ipv4_destroy_rule(&sid); + } else if (likely(nf_ct_l3num(ct) == AF_INET6)) { + sid.src_ip.ip6[0] = *((struct sfe_ipv6_addr *)&orig_tuple.src.u3.in6); + sid.dest_ip.ip6[0] = *((struct sfe_ipv6_addr *)&orig_tuple.dst.u3.in6); + + sfe_ipv6_destroy_rule(&sid); + } else { + DEBUG_TRACE("ignoring non-IPv4 and non-IPv6 connection\n"); + } + + return NOTIFY_DONE; +} + +/* + * Netfilter conntrack event system to monitor connection tracking changes + */ +#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS +static struct notifier_block sfe_cm_conntrack_notifier = { + .notifier_call = sfe_cm_conntrack_event, +}; +#else +static struct nf_ct_event_notifier sfe_cm_conntrack_notifier = { + .fcn = sfe_cm_conntrack_event, +}; +#endif +#endif + +/* + * Structure to establish a hook into the post routing netfilter point - this + * will pick up local outbound and packets going from one interface to another. 
+ * + * Note: see include/linux/netfilter_ipv4.h for info related to priority levels. + * We want to examine packets after NAT translation and any ALG processing. + */ +static struct nf_hook_ops sfe_cm_ops_post_routing[] __read_mostly = { + SFE_IPV4_NF_POST_ROUTING_HOOK(__sfe_cm_ipv4_post_routing_hook), +#ifdef SFE_SUPPORT_IPV6 + SFE_IPV6_NF_POST_ROUTING_HOOK(__sfe_cm_ipv6_post_routing_hook), +#endif +}; + +/* + * sfe_cm_sync_rule() + * Synchronize a connection's state. + */ +static void sfe_cm_sync_rule(struct sfe_connection_sync *sis) +{ + struct nf_conntrack_tuple_hash *h; + struct nf_conntrack_tuple tuple; + struct nf_conn *ct; + SFE_NF_CONN_ACCT(acct); + + /* + * Create a tuple so as to be able to look up a connection + */ + memset(&tuple, 0, sizeof(tuple)); + tuple.src.u.all = (__be16)sis->src_port; + tuple.dst.dir = IP_CT_DIR_ORIGINAL; + tuple.dst.protonum = (u8)sis->protocol; + tuple.dst.u.all = (__be16)sis->dest_port; + + if (sis->is_v6) { + tuple.src.u3.in6 = *((struct in6_addr *)sis->src_ip.ip6); + tuple.dst.u3.in6 = *((struct in6_addr *)sis->dest_ip.ip6); + tuple.src.l3num = AF_INET6; + + DEBUG_TRACE("update connection - p: %d, s: %pI6:%u, d: %pI6:%u\n", + (int)tuple.dst.protonum, + &tuple.src.u3.in6, (unsigned int)ntohs(tuple.src.u.all), + &tuple.dst.u3.in6, (unsigned int)ntohs(tuple.dst.u.all)); + } else { + tuple.src.u3.ip = sis->src_ip.ip; + tuple.dst.u3.ip = sis->dest_ip.ip; + tuple.src.l3num = AF_INET; + + DEBUG_TRACE("update connection - p: %d, s: %pI4:%u, d: %pI4:%u\n", + (int)tuple.dst.protonum, + &tuple.src.u3.ip, (unsigned int)ntohs(tuple.src.u.all), + &tuple.dst.u3.ip, (unsigned int)ntohs(tuple.dst.u.all)); + } + + /* + * Look up conntrack connection + */ + h = nf_conntrack_find_get(&init_net, SFE_NF_CT_DEFAULT_ZONE, &tuple); + if (unlikely(!h)) { + DEBUG_TRACE("no connection found\n"); + return; + } + + ct = nf_ct_tuplehash_to_ctrack(h); +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 9, 0)) + NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct); +#endif + /* + * Only update if this is not a fixed timeout + */ + if (!test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) { + spin_lock_bh(&ct->lock); +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 9, 0)) + ct->timeout.expires += sis->delta_jiffies; +#else + ct->timeout += sis->delta_jiffies; +#endif + spin_unlock_bh(&ct->lock); + } + + acct = nf_conn_acct_find(ct); + if (acct) { + spin_lock_bh(&ct->lock); + atomic64_add(sis->src_new_packet_count, &SFE_ACCT_COUNTER(acct)[IP_CT_DIR_ORIGINAL].packets); + atomic64_add(sis->src_new_byte_count, &SFE_ACCT_COUNTER(acct)[IP_CT_DIR_ORIGINAL].bytes); + atomic64_add(sis->dest_new_packet_count, &SFE_ACCT_COUNTER(acct)[IP_CT_DIR_REPLY].packets); + atomic64_add(sis->dest_new_byte_count, &SFE_ACCT_COUNTER(acct)[IP_CT_DIR_REPLY].bytes); + spin_unlock_bh(&ct->lock); + } + + switch (sis->protocol) { + case IPPROTO_TCP: + spin_lock_bh(&ct->lock); + if (ct->proto.tcp.seen[0].td_maxwin < sis->src_td_max_window) { + ct->proto.tcp.seen[0].td_maxwin = sis->src_td_max_window; + } + if ((s32)(ct->proto.tcp.seen[0].td_end - sis->src_td_end) < 0) { + ct->proto.tcp.seen[0].td_end = sis->src_td_end; + } + if ((s32)(ct->proto.tcp.seen[0].td_maxend - sis->src_td_max_end) < 0) { + ct->proto.tcp.seen[0].td_maxend = sis->src_td_max_end; + } + if (ct->proto.tcp.seen[1].td_maxwin < sis->dest_td_max_window) { + ct->proto.tcp.seen[1].td_maxwin = sis->dest_td_max_window; + } + if ((s32)(ct->proto.tcp.seen[1].td_end - sis->dest_td_end) < 0) { + ct->proto.tcp.seen[1].td_end = sis->dest_td_end; + } + if 
((s32)(ct->proto.tcp.seen[1].td_maxend - sis->dest_td_max_end) < 0) { + ct->proto.tcp.seen[1].td_maxend = sis->dest_td_max_end; + } + spin_unlock_bh(&ct->lock); + break; +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3, 4, 0)) + case IPPROTO_UDP: + /* + * In Linux connection track, UDP flow has two timeout values: + * /proc/sys/net/netfilter/nf_conntrack_udp_timeout: + * this is for uni-direction UDP flow, normally its value is 60 seconds + * /proc/sys/net/netfilter/nf_conntrack_udp_timeout_stream: + * this is for bi-direction UDP flow, normally its value is 180 seconds + * + * Linux will update timer of UDP flow to stream timeout once it seen packets + * in reply direction. But if flow is accelerated by NSS or SFE, Linux won't + * see any packets. So we have to do the same thing in our stats sync message. + */ + if (!test_bit(IPS_ASSURED_BIT, &ct->status) && acct) { + u_int64_t reply_pkts = atomic64_read(&SFE_ACCT_COUNTER(acct)[IP_CT_DIR_REPLY].packets); + + if (reply_pkts != 0) { + unsigned int *timeouts; + struct nf_conntrack_l4proto *l4proto __maybe_unused; + set_bit(IPS_SEEN_REPLY_BIT, &ct->status); + set_bit(IPS_ASSURED_BIT, &ct->status); + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 19, 0)) + l4proto = __nf_ct_l4proto_find((sis->is_v6 ? AF_INET6 : AF_INET), IPPROTO_UDP); + timeouts = nf_ct_timeout_lookup(&init_net, ct, l4proto); + spin_lock_bh(&ct->lock); + ct->timeout.expires = jiffies + timeouts[UDP_CT_REPLIED]; + spin_unlock_bh(&ct->lock); +#else + timeouts = nf_ct_timeout_lookup(ct); + if (!timeouts) { + timeouts = udp_get_timeouts(nf_ct_net(ct)); + } + + spin_lock_bh(&ct->lock); + ct->timeout = jiffies + timeouts[UDP_CT_REPLIED]; + spin_unlock_bh(&ct->lock); +#endif + } + } + break; +#endif /*KERNEL_VERSION(3, 4, 0)*/ + } + + /* + * Release connection + */ + nf_ct_put(ct); +} + +/* + * sfe_cm_device_event() + */ +int sfe_cm_device_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct net_device *dev = SFE_DEV_EVENT_PTR(ptr); + + if (dev && (event == NETDEV_DOWN)) { + sfe_ipv4_destroy_all_rules_for_dev(dev); + sfe_ipv6_destroy_all_rules_for_dev(dev); + } + + return NOTIFY_DONE; +} + +/* + * sfe_cm_inet_event() + */ +static int sfe_cm_inet_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct net_device *dev = ((struct in_ifaddr *)ptr)->ifa_dev->dev; + + if (dev && (event == NETDEV_DOWN)) { + sfe_ipv4_destroy_all_rules_for_dev(dev); + } + + return NOTIFY_DONE; +} + +/* + * sfe_cm_inet6_event() + */ +static int sfe_cm_inet6_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct net_device *dev = ((struct inet6_ifaddr *)ptr)->idev->dev; + + if (dev && (event == NETDEV_DOWN)) { + sfe_ipv6_destroy_all_rules_for_dev(dev); + } + + return NOTIFY_DONE; +} + +/* + * sfe_cm_get_exceptions + * dump exception counters + */ +static ssize_t sfe_cm_get_exceptions(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + int idx, len; + struct sfe_cm *sc = &__sc; + + spin_lock_bh(&sc->lock); + for (len = 0, idx = 0; idx < SFE_CM_EXCEPTION_MAX; idx++) { + if (sc->exceptions[idx]) { + len += snprintf(buf + len, (ssize_t)(PAGE_SIZE - len), "%s = %d\n", sfe_cm_exception_events_string[idx], sc->exceptions[idx]); + } + } + spin_unlock_bh(&sc->lock); + + return len; +} + +/* + * sysfs attributes. 
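Editor's note: the UDP branch above depends on conntrack's two UDP timeouts (unidirectional vs. stream). As a quick way to confirm their values on a target system, here is a minimal user-space sketch; it is an illustration, not part of the patch, and simply reads the proc paths named in the comment above (it prints -1 if a path is absent on a given kernel).

#include <stdio.h>

static long read_sysctl(const char *path)
{
	FILE *f = fopen(path, "r");
	long v = -1;

	if (f) {
		if (fscanf(f, "%ld", &v) != 1)
			v = -1;
		fclose(f);
	}
	return v;
}

int main(void)
{
	printf("udp timeout:        %ld s\n",
	       read_sysctl("/proc/sys/net/netfilter/nf_conntrack_udp_timeout"));
	printf("udp stream timeout: %ld s\n",
	       read_sysctl("/proc/sys/net/netfilter/nf_conntrack_udp_timeout_stream"));
	return 0;
}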
+ */ +static const struct device_attribute sfe_cm_exceptions_attr = + __ATTR(exceptions, S_IRUGO, sfe_cm_get_exceptions, NULL); + +/* + * sfe_cm_init() + */ +static int __init sfe_cm_init(void) +{ + struct sfe_cm *sc = &__sc; + int result = -1; + + DEBUG_INFO("SFE CM init\n"); + + /* + * Create sys/sfe_cm + */ + sc->sys_sfe_cm = kobject_create_and_add("sfe_cm", NULL); + if (!sc->sys_sfe_cm) { + DEBUG_ERROR("failed to register sfe_cm\n"); + goto exit1; + } + + /* + * Create sys/sfe_cm/exceptions + */ + result = sysfs_create_file(sc->sys_sfe_cm, &sfe_cm_exceptions_attr.attr); + if (result) { + DEBUG_ERROR("failed to register exceptions file: %d\n", result); + goto exit2; + } + + sc->dev_notifier.notifier_call = sfe_cm_device_event; + sc->dev_notifier.priority = 1; + register_netdevice_notifier(&sc->dev_notifier); + + sc->inet_notifier.notifier_call = sfe_cm_inet_event; + sc->inet_notifier.priority = 1; + register_inetaddr_notifier(&sc->inet_notifier); + + sc->inet6_notifier.notifier_call = sfe_cm_inet6_event; + sc->inet6_notifier.priority = 1; + register_inet6addr_notifier(&sc->inet6_notifier); + /* + * Register our netfilter hooks. + */ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 13, 0)) + result = nf_register_hooks(sfe_cm_ops_post_routing, ARRAY_SIZE(sfe_cm_ops_post_routing)); +#else + result = nf_register_net_hooks(&init_net, sfe_cm_ops_post_routing, ARRAY_SIZE(sfe_cm_ops_post_routing)); +#endif + if (result < 0) { + DEBUG_ERROR("can't register nf post routing hook: %d\n", result); + goto exit3; + } + + /* + * Register a notifier hook to get fast notifications of expired connections. + * Note: In CONFIG_NF_CONNTRACK_CHAIN_EVENTS enabled case, nf_conntrack_register_notifier() + * function always returns 0. + */ +#ifdef CONFIG_NF_CONNTRACK_EVENTS +#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS + (void)nf_conntrack_register_notifier(&init_net, &sfe_cm_conntrack_notifier); +#else + result = nf_conntrack_register_notifier(&init_net, &sfe_cm_conntrack_notifier); + if (result < 0) { + DEBUG_ERROR("can't register nf notifier hook: %d\n", result); + goto exit4; + } +#endif +#endif + + spin_lock_init(&sc->lock); + + /* + * Hook the receive path in the network stack. + */ + BUG_ON(athrs_fast_nat_recv); + RCU_INIT_POINTER(athrs_fast_nat_recv, sfe_cm_recv); + + /* + * Hook the shortcut sync callback. + */ + sfe_ipv4_register_sync_rule_callback(sfe_cm_sync_rule); + sfe_ipv6_register_sync_rule_callback(sfe_cm_sync_rule); + return 0; + +#ifdef CONFIG_NF_CONNTRACK_EVENTS +#ifndef CONFIG_NF_CONNTRACK_CHAIN_EVENTS +exit4: +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 13, 0)) + nf_unregister_hooks(sfe_cm_ops_post_routing, ARRAY_SIZE(sfe_cm_ops_post_routing)); +#else + nf_unregister_net_hooks(&init_net, sfe_cm_ops_post_routing, ARRAY_SIZE(sfe_cm_ops_post_routing)); +#endif + +#endif +#endif +exit3: + unregister_inet6addr_notifier(&sc->inet6_notifier); + unregister_inetaddr_notifier(&sc->inet_notifier); + unregister_netdevice_notifier(&sc->dev_notifier); +exit2: + kobject_put(sc->sys_sfe_cm); + +exit1: + return result; +} + +/* + * sfe_cm_exit() + */ +static void __exit sfe_cm_exit(void) +{ + struct sfe_cm *sc = &__sc; + + DEBUG_INFO("SFE CM exit\n"); + + /* + * Unregister our sync callback. + */ + sfe_ipv4_register_sync_rule_callback(NULL); + sfe_ipv6_register_sync_rule_callback(NULL); + + /* + * Unregister our receive callback. + */ + RCU_INIT_POINTER(athrs_fast_nat_recv, NULL); + + /* + * Wait for all callbacks to complete. + */ + rcu_barrier(); + + /* + * Destroy all connections. 
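Editor's note: sfe_cm_init() above uses the usual kernel goto-unwind ladder - each acquired resource gets a label, and a failure jumps to the label that releases everything acquired so far, in reverse order. A stand-alone sketch of the same pattern with hypothetical malloc stand-ins for the kobject, sysfs file and notifier registrations (not part of the patch):

#include <stdlib.h>

static int init_three_resources(void)
{
	void *a, *b, *c;

	a = malloc(16);
	if (!a)
		goto fail_a;

	b = malloc(16);
	if (!b)
		goto fail_b;

	c = malloc(16);
	if (!c)
		goto fail_c;

	/* All acquired; real code would keep them live until exit.
	 * They are freed here only to keep the sketch leak-free. */
	free(c);
	free(b);
	free(a);
	return 0;

fail_c:
	free(b);
fail_b:
	free(a);
fail_a:
	return -1;
}

int main(void)
{
	return init_three_resources();
}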
+ */ + sfe_ipv4_destroy_all_rules_for_dev(NULL); + sfe_ipv6_destroy_all_rules_for_dev(NULL); + +#ifdef CONFIG_NF_CONNTRACK_EVENTS + nf_conntrack_unregister_notifier(&init_net, &sfe_cm_conntrack_notifier); + +#endif + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 13, 0)) + nf_unregister_hooks(sfe_cm_ops_post_routing, ARRAY_SIZE(sfe_cm_ops_post_routing)); +#else + nf_unregister_net_hooks(&init_net, sfe_cm_ops_post_routing, ARRAY_SIZE(sfe_cm_ops_post_routing)); +#endif + unregister_inet6addr_notifier(&sc->inet6_notifier); + unregister_inetaddr_notifier(&sc->inet_notifier); + unregister_netdevice_notifier(&sc->dev_notifier); + + kobject_put(sc->sys_sfe_cm); +} + +module_init(sfe_cm_init) +module_exit(sfe_cm_exit) + +MODULE_DESCRIPTION("Shortcut Forwarding Engine - Connection Manager"); +MODULE_LICENSE("Dual BSD/GPL"); + diff --git a/shortcut-fe/shortcut-fe/sfe_cm.h b/shortcut-fe/shortcut-fe/sfe_cm.h new file mode 100644 index 000000000..23cbde859 --- /dev/null +++ b/shortcut-fe/shortcut-fe/sfe_cm.h @@ -0,0 +1,259 @@ +/* + * sfe_cm.h + * Shortcut forwarding engine. + * + * Copyright (c) 2013-2016 The Linux Foundation. All rights reserved. + * Permission to use, copy, modify, and/or distribute this software for + * any purpose with or without fee is hereby granted, provided that the + * above copyright notice and this permission notice appear in all copies. + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT + * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +/* + * connection flags. + */ +#define SFE_CREATE_FLAG_NO_SEQ_CHECK BIT(0) + /* Indicates that we should not check sequence numbers */ +#define SFE_CREATE_FLAG_REMARK_PRIORITY BIT(1) + /* Indicates that we should remark priority of skb */ +#define SFE_CREATE_FLAG_REMARK_DSCP BIT(2) + /* Indicates that we should remark DSCP of packet */ + +/* + * IPv6 address structure + */ +struct sfe_ipv6_addr { + __be32 addr[4]; +}; + +typedef union { + __be32 ip; + struct sfe_ipv6_addr ip6[1]; +} sfe_ip_addr_t; + +/* + * connection creation structure. + */ +struct sfe_connection_create { + int protocol; + struct net_device *src_dev; + struct net_device *dest_dev; + u32 flags; + u32 src_mtu; + u32 dest_mtu; + sfe_ip_addr_t src_ip; + sfe_ip_addr_t src_ip_xlate; + sfe_ip_addr_t dest_ip; + sfe_ip_addr_t dest_ip_xlate; + __be16 src_port; + __be16 src_port_xlate; + __be16 dest_port; + __be16 dest_port_xlate; + u8 src_mac[ETH_ALEN]; + u8 src_mac_xlate[ETH_ALEN]; + u8 dest_mac[ETH_ALEN]; + u8 dest_mac_xlate[ETH_ALEN]; + u8 src_td_window_scale; + u32 src_td_max_window; + u32 src_td_end; + u32 src_td_max_end; + u8 dest_td_window_scale; + u32 dest_td_max_window; + u32 dest_td_end; + u32 dest_td_max_end; + u32 mark; +#ifdef CONFIG_XFRM + u32 original_accel; + u32 reply_accel; +#endif + u32 src_priority; + u32 dest_priority; + u32 src_dscp; + u32 dest_dscp; +}; + +/* + * connection destruction structure. 
+ */ +struct sfe_connection_destroy { + int protocol; + sfe_ip_addr_t src_ip; + sfe_ip_addr_t dest_ip; + __be16 src_port; + __be16 dest_port; +}; + +typedef enum sfe_sync_reason { + SFE_SYNC_REASON_STATS, /* Sync is to synchronize stats */ + SFE_SYNC_REASON_FLUSH, /* Sync is to flush a entry */ + SFE_SYNC_REASON_DESTROY /* Sync is to destroy a entry(requested by connection manager) */ +} sfe_sync_reason_t; + +/* + * Structure used to sync connection stats/state back within the system. + * + * NOTE: The addresses here are NON-NAT addresses, i.e. the true endpoint addressing. + * 'src' is the creator of the connection. + */ +struct sfe_connection_sync { + struct net_device *src_dev; + struct net_device *dest_dev; + int is_v6; /* Is it for ipv6? */ + int protocol; /* IP protocol number (IPPROTO_...) */ + sfe_ip_addr_t src_ip; /* Non-NAT source address, i.e. the creator of the connection */ + sfe_ip_addr_t src_ip_xlate; /* NATed source address */ + __be16 src_port; /* Non-NAT source port */ + __be16 src_port_xlate; /* NATed source port */ + sfe_ip_addr_t dest_ip; /* Non-NAT destination address, i.e. to whom the connection was created */ + sfe_ip_addr_t dest_ip_xlate; /* NATed destination address */ + __be16 dest_port; /* Non-NAT destination port */ + __be16 dest_port_xlate; /* NATed destination port */ + u32 src_td_max_window; + u32 src_td_end; + u32 src_td_max_end; + u64 src_packet_count; + u64 src_byte_count; + u32 src_new_packet_count; + u32 src_new_byte_count; + u32 dest_td_max_window; + u32 dest_td_end; + u32 dest_td_max_end; + u64 dest_packet_count; + u64 dest_byte_count; + u32 dest_new_packet_count; + u32 dest_new_byte_count; + u32 reason; /* reason for stats sync message, i.e. destroy, flush, period sync */ + u64 delta_jiffies; /* Time to be added to the current timeout to keep the connection alive */ +}; + +/* + * connection mark structure + */ +struct sfe_connection_mark { + int protocol; + sfe_ip_addr_t src_ip; + sfe_ip_addr_t dest_ip; + __be16 src_port; + __be16 dest_port; + u32 mark; +}; + +/* + * Expose the hook for the receive processing. + */ +extern int (*athrs_fast_nat_recv)(struct sk_buff *skb); + +/* + * Expose what should be a static flag in the TCP connection tracker. + */ +extern int nf_ct_tcp_no_window_check; + +/* + * This callback will be called in a timer + * at 100 times per second to sync stats back to + * Linux connection track. + * + * A RCU lock is taken to prevent this callback + * from unregistering. 
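Editor's note: to illustrate how a connection manager consumes these sync messages, here is a minimal callback of the sfe_sync_rule_callback_t shape declared just below. It is a sketch only, not part of the patch; it assumes the sfe_cm.h definitions above, lives in kernel code (hence pr_debug), and merely logs the per-period deltas - the real consumer is sfe_cm_sync_rule() in sfe_cm.c.

#include "sfe_cm.h"

/* Log the per-period deltas carried by a sync message; the *_new_* fields
 * are deltas for the last sync period and delta_jiffies the time covered. */
static void example_sync_rule_callback(struct sfe_connection_sync *sis)
{
	pr_debug("proto %d: orig +%u bytes, reply +%u bytes in %llu jiffies\n",
		 sis->protocol, sis->src_new_byte_count,
		 sis->dest_new_byte_count,
		 (unsigned long long)sis->delta_jiffies);
}

/* Registered with sfe_ipv4_register_sync_rule_callback(example_sync_rule_callback)
 * and unregistered by passing NULL, exactly as sfe_cm_exit() does. */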
+ */ +typedef void (*sfe_sync_rule_callback_t)(struct sfe_connection_sync *); + +/* + * IPv4 APIs used by connection manager + */ +int sfe_ipv4_recv(struct net_device *dev, struct sk_buff *skb); +int sfe_ipv4_create_rule(struct sfe_connection_create *sic); +void sfe_ipv4_destroy_rule(struct sfe_connection_destroy *sid); +void sfe_ipv4_destroy_all_rules_for_dev(struct net_device *dev); +void sfe_ipv4_register_sync_rule_callback(sfe_sync_rule_callback_t callback); +void sfe_ipv4_update_rule(struct sfe_connection_create *sic); +void sfe_ipv4_mark_rule(struct sfe_connection_mark *mark); + +#ifdef SFE_SUPPORT_IPV6 +/* + * IPv6 APIs used by connection manager + */ +int sfe_ipv6_recv(struct net_device *dev, struct sk_buff *skb); +int sfe_ipv6_create_rule(struct sfe_connection_create *sic); +void sfe_ipv6_destroy_rule(struct sfe_connection_destroy *sid); +void sfe_ipv6_destroy_all_rules_for_dev(struct net_device *dev); +void sfe_ipv6_register_sync_rule_callback(sfe_sync_rule_callback_t callback); +void sfe_ipv6_update_rule(struct sfe_connection_create *sic); +void sfe_ipv6_mark_rule(struct sfe_connection_mark *mark); +#else +static inline int sfe_ipv6_recv(struct net_device *dev, struct sk_buff *skb) +{ + return 0; +} + +static inline int sfe_ipv6_create_rule(struct sfe_connection_create *sic) +{ + return 0; +} + +static inline void sfe_ipv6_destroy_rule(struct sfe_connection_destroy *sid) +{ + return; +} + +static inline void sfe_ipv6_destroy_all_rules_for_dev(struct net_device *dev) +{ + return; +} + +static inline void sfe_ipv6_register_sync_rule_callback(sfe_sync_rule_callback_t callback) +{ + return; +} + +static inline void sfe_ipv6_update_rule(struct sfe_connection_create *sic) +{ + return; +} + +static inline void sfe_ipv6_mark_rule(struct sfe_connection_mark *mark) +{ + return; +} +#endif + +/* + * sfe_ipv6_addr_equal() + * compare ipv6 address + * + * return: 1, equal; 0, no equal + */ +static inline int sfe_ipv6_addr_equal(struct sfe_ipv6_addr *a, + struct sfe_ipv6_addr *b) +{ + return a->addr[0] == b->addr[0] && + a->addr[1] == b->addr[1] && + a->addr[2] == b->addr[2] && + a->addr[3] == b->addr[3]; +} + +/* + * sfe_ipv4_addr_equal() + * compare ipv4 address + * + * return: 1, equal; 0, no equal + */ +#define sfe_ipv4_addr_equal(a, b) ((u32)(a) == (u32)(b)) + +/* + * sfe_addr_equal() + * compare ipv4 or ipv6 address + * + * return: 1, equal; 0, no equal + */ +static inline int sfe_addr_equal(sfe_ip_addr_t *a, + sfe_ip_addr_t *b, int is_v4) +{ + return is_v4 ? sfe_ipv4_addr_equal(a->ip, b->ip) : sfe_ipv6_addr_equal(a->ip6, b->ip6); +} diff --git a/shortcut-fe/shortcut-fe/sfe_ipv4.c b/shortcut-fe/shortcut-fe/sfe_ipv4.c new file mode 100644 index 000000000..9f7ebd1c9 --- /dev/null +++ b/shortcut-fe/shortcut-fe/sfe_ipv4.c @@ -0,0 +1,3610 @@ +/* + * sfe_ipv4.c + * Shortcut forwarding engine - IPv4 edition. + * + * Copyright (c) 2013-2016, 2019-2020 The Linux Foundation. All rights reserved. + * Permission to use, copy, modify, and/or distribute this software for + * any purpose with or without fee is hereby granted, provided that the + * above copyright notice and this permission notice appear in all copies. + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT + * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "sfe.h" +#include "sfe_cm.h" + +/* + * By default Linux IP header and transport layer header structures are + * unpacked, assuming that such headers should be 32-bit aligned. + * Unfortunately some wireless adaptors can't cope with this requirement and + * some CPUs can't handle misaligned accesses. For those platforms we + * define SFE_IPV4_UNALIGNED_IP_HEADER and mark the structures as packed. + * When we do this the compiler will generate slightly worse code than for the + * aligned case (on most platforms) but will be much quicker than fixing + * things up in an unaligned trap handler. + */ +#define SFE_IPV4_UNALIGNED_IP_HEADER 1 +#if SFE_IPV4_UNALIGNED_IP_HEADER +#define SFE_IPV4_UNALIGNED_STRUCT __attribute__((packed)) +#else +#define SFE_IPV4_UNALIGNED_STRUCT +#endif + +/* + * An Ethernet header, but with an optional "packed" attribute to + * help with performance on some platforms (see the definition of + * SFE_IPV4_UNALIGNED_STRUCT) + */ +struct sfe_ipv4_eth_hdr { + __be16 h_dest[ETH_ALEN / 2]; + __be16 h_source[ETH_ALEN / 2]; + __be16 h_proto; +} SFE_IPV4_UNALIGNED_STRUCT; + +#define SFE_IPV4_DSCP_MASK 0x3 +#define SFE_IPV4_DSCP_SHIFT 2 + +/* + * An IPv4 header, but with an optional "packed" attribute to + * help with performance on some platforms (see the definition of + * SFE_IPV4_UNALIGNED_STRUCT) + */ +struct sfe_ipv4_ip_hdr { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 ihl:4, + version:4; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u8 version:4, + ihl:4; +#else +#error "Please fix " +#endif + __u8 tos; + __be16 tot_len; + __be16 id; + __be16 frag_off; + __u8 ttl; + __u8 protocol; + __sum16 check; + __be32 saddr; + __be32 daddr; + + /* + * The options start here. + */ +} SFE_IPV4_UNALIGNED_STRUCT; + +/* + * A UDP header, but with an optional "packed" attribute to + * help with performance on some platforms (see the definition of + * SFE_IPV4_UNALIGNED_STRUCT) + */ +struct sfe_ipv4_udp_hdr { + __be16 source; + __be16 dest; + __be16 len; + __sum16 check; +} SFE_IPV4_UNALIGNED_STRUCT; + +/* + * A TCP header, but with an optional "packed" attribute to + * help with performance on some platforms (see the definition of + * SFE_IPV4_UNALIGNED_STRUCT) + */ +struct sfe_ipv4_tcp_hdr { + __be16 source; + __be16 dest; + __be32 seq; + __be32 ack_seq; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u16 res1:4, + doff:4, + fin:1, + syn:1, + rst:1, + psh:1, + ack:1, + urg:1, + ece:1, + cwr:1; +#elif defined(__BIG_ENDIAN_BITFIELD) + __u16 doff:4, + res1:4, + cwr:1, + ece:1, + urg:1, + ack:1, + psh:1, + rst:1, + syn:1, + fin:1; +#else +#error "Adjust your defines" +#endif + __be16 window; + __sum16 check; + __be16 urg_ptr; +} SFE_IPV4_UNALIGNED_STRUCT; + +/* + * Specifies the lower bound on ACK numbers carried in the TCP header + */ +#define SFE_IPV4_TCP_MAX_ACK_WINDOW 65520 + +/* + * IPv4 TCP connection match additional data. 
+ */ +struct sfe_ipv4_tcp_connection_match { + u8 win_scale; /* Window scale */ + u32 max_win; /* Maximum window size seen */ + u32 end; /* Sequence number of the next byte to send (seq + segment length) */ + u32 max_end; /* Sequence number of the last byte to ack */ +}; + +/* + * Bit flags for IPv4 connection matching entry. + */ +#define SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_SRC (1<<0) + /* Perform source translation */ +#define SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_DEST (1<<1) + /* Perform destination translation */ +#define SFE_IPV4_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK (1<<2) + /* Ignore TCP sequence numbers */ +#define SFE_IPV4_CONNECTION_MATCH_FLAG_WRITE_FAST_ETH_HDR (1<<3) + /* Fast Ethernet header write */ +#define SFE_IPV4_CONNECTION_MATCH_FLAG_WRITE_L2_HDR (1<<4) + /* Fast Ethernet header write */ +#define SFE_IPV4_CONNECTION_MATCH_FLAG_PRIORITY_REMARK (1<<5) + /* remark priority of SKB */ +#define SFE_IPV4_CONNECTION_MATCH_FLAG_DSCP_REMARK (1<<6) + /* remark DSCP of packet */ + +/* + * IPv4 connection matching structure. + */ +struct sfe_ipv4_connection_match { + /* + * References to other objects. + */ + struct sfe_ipv4_connection_match *next; + struct sfe_ipv4_connection_match *prev; + struct sfe_ipv4_connection *connection; + struct sfe_ipv4_connection_match *counter_match; + /* Matches the flow in the opposite direction as the one in *connection */ + struct sfe_ipv4_connection_match *active_next; + struct sfe_ipv4_connection_match *active_prev; + bool active; /* Flag to indicate if we're on the active list */ + + /* + * Characteristics that identify flows that match this rule. + */ + struct net_device *match_dev; /* Network device */ + u8 match_protocol; /* Protocol */ + __be32 match_src_ip; /* Source IP address */ + __be32 match_dest_ip; /* Destination IP address */ + __be16 match_src_port; /* Source port/connection ident */ + __be16 match_dest_port; /* Destination port/connection ident */ + + /* + * Control the operations of the match. + */ + u32 flags; /* Bit flags */ +#ifdef CONFIG_NF_FLOW_COOKIE + u32 flow_cookie; /* used flow cookie, for debug */ +#endif +#ifdef CONFIG_XFRM + u32 flow_accel; /* The flow accelerated or not */ +#endif + + /* + * Connection state that we track once we match. + */ + union { /* Protocol-specific state */ + struct sfe_ipv4_tcp_connection_match tcp; + } protocol_state; + /* + * Stats recorded in a sync period. These stats will be added to + * rx_packet_count64/rx_byte_count64 after a sync period. + */ + u32 rx_packet_count; + u32 rx_byte_count; + + /* + * Packet translation information. + */ + __be32 xlate_src_ip; /* Address after source translation */ + __be16 xlate_src_port; /* Port/connection ident after source translation */ + u16 xlate_src_csum_adjustment; + /* Transport layer checksum adjustment after source translation */ + u16 xlate_src_partial_csum_adjustment; + /* Transport layer pseudo header checksum adjustment after source translation */ + + __be32 xlate_dest_ip; /* Address after destination translation */ + __be16 xlate_dest_port; /* Port/connection ident after destination translation */ + u16 xlate_dest_csum_adjustment; + /* Transport layer checksum adjustment after destination translation */ + u16 xlate_dest_partial_csum_adjustment; + /* Transport layer pseudo header checksum adjustment after destination translation */ + + /* + * QoS information + */ + u32 priority; + u32 dscp; + + /* + * Packet transmit information. 
+ */ + struct net_device *xmit_dev; /* Network device on which to transmit */ + unsigned short int xmit_dev_mtu; + /* Interface MTU */ + u16 xmit_dest_mac[ETH_ALEN / 2]; + /* Destination MAC address to use when forwarding */ + u16 xmit_src_mac[ETH_ALEN / 2]; + /* Source MAC address to use when forwarding */ + + /* + * Summary stats. + */ + u64 rx_packet_count64; + u64 rx_byte_count64; +}; + +/* + * Per-connection data structure. + */ +struct sfe_ipv4_connection { + struct sfe_ipv4_connection *next; + /* Pointer to the next entry in a hash chain */ + struct sfe_ipv4_connection *prev; + /* Pointer to the previous entry in a hash chain */ + int protocol; /* IP protocol number */ + __be32 src_ip; /* Src IP addr pre-translation */ + __be32 src_ip_xlate; /* Src IP addr post-translation */ + __be32 dest_ip; /* Dest IP addr pre-translation */ + __be32 dest_ip_xlate; /* Dest IP addr post-translation */ + __be16 src_port; /* Src port pre-translation */ + __be16 src_port_xlate; /* Src port post-translation */ + __be16 dest_port; /* Dest port pre-translation */ + __be16 dest_port_xlate; /* Dest port post-translation */ + struct sfe_ipv4_connection_match *original_match; + /* Original direction matching structure */ + struct net_device *original_dev; + /* Original direction source device */ + struct sfe_ipv4_connection_match *reply_match; + /* Reply direction matching structure */ + struct net_device *reply_dev; /* Reply direction source device */ + u64 last_sync_jiffies; /* Jiffies count for the last sync */ + struct sfe_ipv4_connection *all_connections_next; + /* Pointer to the next entry in the list of all connections */ + struct sfe_ipv4_connection *all_connections_prev; + /* Pointer to the previous entry in the list of all connections */ + u32 mark; /* mark for outgoing packet */ + u32 debug_read_seq; /* sequence number for debug dump */ +}; + +/* + * IPv4 connections and hash table size information. 
+ */ +#define SFE_IPV4_CONNECTION_HASH_SHIFT 12 +#define SFE_IPV4_CONNECTION_HASH_SIZE (1 << SFE_IPV4_CONNECTION_HASH_SHIFT) +#define SFE_IPV4_CONNECTION_HASH_MASK (SFE_IPV4_CONNECTION_HASH_SIZE - 1) + +#ifdef CONFIG_NF_FLOW_COOKIE +#define SFE_FLOW_COOKIE_SIZE 2048 +#define SFE_FLOW_COOKIE_MASK 0x7ff + +struct sfe_flow_cookie_entry { + struct sfe_ipv4_connection_match *match; + unsigned long last_clean_time; +}; +#endif + +enum sfe_ipv4_exception_events { + SFE_IPV4_EXCEPTION_EVENT_UDP_HEADER_INCOMPLETE, + SFE_IPV4_EXCEPTION_EVENT_UDP_NO_CONNECTION, + SFE_IPV4_EXCEPTION_EVENT_UDP_IP_OPTIONS_OR_INITIAL_FRAGMENT, + SFE_IPV4_EXCEPTION_EVENT_UDP_SMALL_TTL, + SFE_IPV4_EXCEPTION_EVENT_UDP_NEEDS_FRAGMENTATION, + SFE_IPV4_EXCEPTION_EVENT_TCP_HEADER_INCOMPLETE, + SFE_IPV4_EXCEPTION_EVENT_TCP_NO_CONNECTION_SLOW_FLAGS, + SFE_IPV4_EXCEPTION_EVENT_TCP_NO_CONNECTION_FAST_FLAGS, + SFE_IPV4_EXCEPTION_EVENT_TCP_IP_OPTIONS_OR_INITIAL_FRAGMENT, + SFE_IPV4_EXCEPTION_EVENT_TCP_SMALL_TTL, + SFE_IPV4_EXCEPTION_EVENT_TCP_NEEDS_FRAGMENTATION, + SFE_IPV4_EXCEPTION_EVENT_TCP_FLAGS, + SFE_IPV4_EXCEPTION_EVENT_TCP_SEQ_EXCEEDS_RIGHT_EDGE, + SFE_IPV4_EXCEPTION_EVENT_TCP_SMALL_DATA_OFFS, + SFE_IPV4_EXCEPTION_EVENT_TCP_BAD_SACK, + SFE_IPV4_EXCEPTION_EVENT_TCP_BIG_DATA_OFFS, + SFE_IPV4_EXCEPTION_EVENT_TCP_SEQ_BEFORE_LEFT_EDGE, + SFE_IPV4_EXCEPTION_EVENT_TCP_ACK_EXCEEDS_RIGHT_EDGE, + SFE_IPV4_EXCEPTION_EVENT_TCP_ACK_BEFORE_LEFT_EDGE, + SFE_IPV4_EXCEPTION_EVENT_ICMP_HEADER_INCOMPLETE, + SFE_IPV4_EXCEPTION_EVENT_ICMP_UNHANDLED_TYPE, + SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_HEADER_INCOMPLETE, + SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_NON_V4, + SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_IP_OPTIONS_INCOMPLETE, + SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_UDP_HEADER_INCOMPLETE, + SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_TCP_HEADER_INCOMPLETE, + SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_UNHANDLED_PROTOCOL, + SFE_IPV4_EXCEPTION_EVENT_ICMP_NO_CONNECTION, + SFE_IPV4_EXCEPTION_EVENT_ICMP_FLUSHED_CONNECTION, + SFE_IPV4_EXCEPTION_EVENT_HEADER_INCOMPLETE, + SFE_IPV4_EXCEPTION_EVENT_BAD_TOTAL_LENGTH, + SFE_IPV4_EXCEPTION_EVENT_NON_V4, + SFE_IPV4_EXCEPTION_EVENT_NON_INITIAL_FRAGMENT, + SFE_IPV4_EXCEPTION_EVENT_DATAGRAM_INCOMPLETE, + SFE_IPV4_EXCEPTION_EVENT_IP_OPTIONS_INCOMPLETE, + SFE_IPV4_EXCEPTION_EVENT_UNHANDLED_PROTOCOL, + SFE_IPV4_EXCEPTION_EVENT_LAST +}; + +static char *sfe_ipv4_exception_events_string[SFE_IPV4_EXCEPTION_EVENT_LAST] = { + "UDP_HEADER_INCOMPLETE", + "UDP_NO_CONNECTION", + "UDP_IP_OPTIONS_OR_INITIAL_FRAGMENT", + "UDP_SMALL_TTL", + "UDP_NEEDS_FRAGMENTATION", + "TCP_HEADER_INCOMPLETE", + "TCP_NO_CONNECTION_SLOW_FLAGS", + "TCP_NO_CONNECTION_FAST_FLAGS", + "TCP_IP_OPTIONS_OR_INITIAL_FRAGMENT", + "TCP_SMALL_TTL", + "TCP_NEEDS_FRAGMENTATION", + "TCP_FLAGS", + "TCP_SEQ_EXCEEDS_RIGHT_EDGE", + "TCP_SMALL_DATA_OFFS", + "TCP_BAD_SACK", + "TCP_BIG_DATA_OFFS", + "TCP_SEQ_BEFORE_LEFT_EDGE", + "TCP_ACK_EXCEEDS_RIGHT_EDGE", + "TCP_ACK_BEFORE_LEFT_EDGE", + "ICMP_HEADER_INCOMPLETE", + "ICMP_UNHANDLED_TYPE", + "ICMP_IPV4_HEADER_INCOMPLETE", + "ICMP_IPV4_NON_V4", + "ICMP_IPV4_IP_OPTIONS_INCOMPLETE", + "ICMP_IPV4_UDP_HEADER_INCOMPLETE", + "ICMP_IPV4_TCP_HEADER_INCOMPLETE", + "ICMP_IPV4_UNHANDLED_PROTOCOL", + "ICMP_NO_CONNECTION", + "ICMP_FLUSHED_CONNECTION", + "HEADER_INCOMPLETE", + "BAD_TOTAL_LENGTH", + "NON_V4", + "NON_INITIAL_FRAGMENT", + "DATAGRAM_INCOMPLETE", + "IP_OPTIONS_INCOMPLETE", + "UNHANDLED_PROTOCOL" +}; + +/* + * Per-module structure. 
+ */ +struct sfe_ipv4 { + spinlock_t lock; /* Lock for SMP correctness */ + struct sfe_ipv4_connection_match *active_head; + /* Head of the list of recently active connections */ + struct sfe_ipv4_connection_match *active_tail; + /* Tail of the list of recently active connections */ + struct sfe_ipv4_connection *all_connections_head; + /* Head of the list of all connections */ + struct sfe_ipv4_connection *all_connections_tail; + /* Tail of the list of all connections */ + unsigned int num_connections; /* Number of connections */ + struct timer_list timer; /* Timer used for periodic sync ops */ + sfe_sync_rule_callback_t __rcu sync_rule_callback; + /* Callback function registered by a connection manager for stats syncing */ + struct sfe_ipv4_connection *conn_hash[SFE_IPV4_CONNECTION_HASH_SIZE]; + /* Connection hash table */ + struct sfe_ipv4_connection_match *conn_match_hash[SFE_IPV4_CONNECTION_HASH_SIZE]; + /* Connection match hash table */ +#ifdef CONFIG_NF_FLOW_COOKIE + struct sfe_flow_cookie_entry sfe_flow_cookie_table[SFE_FLOW_COOKIE_SIZE]; + /* flow cookie table*/ + flow_cookie_set_func_t flow_cookie_set_func; + /* function used to configure flow cookie in hardware*/ + int flow_cookie_enable; + /* Enable/disable flow cookie at runtime */ +#endif + + /* + * Stats recorded in a sync period. These stats will be added to + * connection_xxx64 after a sync period. + */ + u32 connection_create_requests; + /* Number of IPv4 connection create requests */ + u32 connection_create_collisions; + /* Number of IPv4 connection create requests that collided with existing hash table entries */ + u32 connection_destroy_requests; + /* Number of IPv4 connection destroy requests */ + u32 connection_destroy_misses; + /* Number of IPv4 connection destroy requests that missed our hash table */ + u32 connection_match_hash_hits; + /* Number of IPv4 connection match hash hits */ + u32 connection_match_hash_reorders; + /* Number of IPv4 connection match hash reorders */ + u32 connection_flushes; /* Number of IPv4 connection flushes */ + u32 packets_forwarded; /* Number of IPv4 packets forwarded */ + u32 packets_not_forwarded; /* Number of IPv4 packets not forwarded */ + u32 exception_events[SFE_IPV4_EXCEPTION_EVENT_LAST]; + + /* + * Summary statistics. + */ + u64 connection_create_requests64; + /* Number of IPv4 connection create requests */ + u64 connection_create_collisions64; + /* Number of IPv4 connection create requests that collided with existing hash table entries */ + u64 connection_destroy_requests64; + /* Number of IPv4 connection destroy requests */ + u64 connection_destroy_misses64; + /* Number of IPv4 connection destroy requests that missed our hash table */ + u64 connection_match_hash_hits64; + /* Number of IPv4 connection match hash hits */ + u64 connection_match_hash_reorders64; + /* Number of IPv4 connection match hash reorders */ + u64 connection_flushes64; /* Number of IPv4 connection flushes */ + u64 packets_forwarded64; /* Number of IPv4 packets forwarded */ + u64 packets_not_forwarded64; + /* Number of IPv4 packets not forwarded */ + u64 exception_events64[SFE_IPV4_EXCEPTION_EVENT_LAST]; + + /* + * Control state. + */ + struct kobject *sys_sfe_ipv4; /* sysfs linkage */ + int debug_dev; /* Major number of the debug char device */ + u32 debug_read_seq; /* sequence number for debug dump */ +}; + +/* + * Enumeration of the XML output. 
+ */ +enum sfe_ipv4_debug_xml_states { + SFE_IPV4_DEBUG_XML_STATE_START, + SFE_IPV4_DEBUG_XML_STATE_CONNECTIONS_START, + SFE_IPV4_DEBUG_XML_STATE_CONNECTIONS_CONNECTION, + SFE_IPV4_DEBUG_XML_STATE_CONNECTIONS_END, + SFE_IPV4_DEBUG_XML_STATE_EXCEPTIONS_START, + SFE_IPV4_DEBUG_XML_STATE_EXCEPTIONS_EXCEPTION, + SFE_IPV4_DEBUG_XML_STATE_EXCEPTIONS_END, + SFE_IPV4_DEBUG_XML_STATE_STATS, + SFE_IPV4_DEBUG_XML_STATE_END, + SFE_IPV4_DEBUG_XML_STATE_DONE +}; + +/* + * XML write state. + */ +struct sfe_ipv4_debug_xml_write_state { + enum sfe_ipv4_debug_xml_states state; + /* XML output file state machine state */ + int iter_exception; /* Next exception iterator */ +}; + +typedef bool (*sfe_ipv4_debug_xml_write_method_t)(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length, + int *total_read, struct sfe_ipv4_debug_xml_write_state *ws); + +static struct sfe_ipv4 __si; + +/* + * sfe_ipv4_gen_ip_csum() + * Generate the IP checksum for an IPv4 header. + * + * Note that this function assumes that we have only 20 bytes of IP header. + */ +static inline u16 sfe_ipv4_gen_ip_csum(struct sfe_ipv4_ip_hdr *iph) +{ + u32 sum; + u16 *i = (u16 *)iph; + + iph->check = 0; + + /* + * Generate the sum. + */ + sum = i[0] + i[1] + i[2] + i[3] + i[4] + i[5] + i[6] + i[7] + i[8] + i[9]; + + /* + * Fold it to ones-complement form. + */ + sum = (sum & 0xffff) + (sum >> 16); + sum = (sum & 0xffff) + (sum >> 16); + + return (u16)sum ^ 0xffff; +} + +/* + * sfe_ipv4_get_connection_match_hash() + * Generate the hash used in connection match lookups. + */ +static inline unsigned int sfe_ipv4_get_connection_match_hash(struct net_device *dev, u8 protocol, + __be32 src_ip, __be16 src_port, + __be32 dest_ip, __be16 dest_port) +{ + size_t dev_addr = (size_t)dev; + u32 hash = ((u32)dev_addr) ^ ntohl(src_ip ^ dest_ip) ^ protocol ^ ntohs(src_port ^ dest_port); + return ((hash >> SFE_IPV4_CONNECTION_HASH_SHIFT) ^ hash) & SFE_IPV4_CONNECTION_HASH_MASK; +} + +/* + * sfe_ipv4_find_sfe_ipv4_connection_match() + * Get the IPv4 flow match info that corresponds to a particular 5-tuple. + * + * On entry we must be holding the lock that protects the hash table. + */ +static struct sfe_ipv4_connection_match * +sfe_ipv4_find_sfe_ipv4_connection_match(struct sfe_ipv4 *si, struct net_device *dev, u8 protocol, + __be32 src_ip, __be16 src_port, + __be32 dest_ip, __be16 dest_port) +{ + struct sfe_ipv4_connection_match *cm; + struct sfe_ipv4_connection_match *head; + unsigned int conn_match_idx; + + conn_match_idx = sfe_ipv4_get_connection_match_hash(dev, protocol, src_ip, src_port, dest_ip, dest_port); + cm = si->conn_match_hash[conn_match_idx]; + + /* + * If we don't have anything in this chain then bail. + */ + if (unlikely(!cm)) { + return NULL; + } + + /* + * Hopefully the first entry is the one we want. + */ + if ((cm->match_src_port == src_port) + && (cm->match_dest_port == dest_port) + && (cm->match_src_ip == src_ip) + && (cm->match_dest_ip == dest_ip) + && (cm->match_protocol == protocol) + && (cm->match_dev == dev)) { + si->connection_match_hash_hits++; + return cm; + } + + /* + * Unfortunately we didn't find it at head, so we search it in chain and + * move matching entry to the top of the hash chain. We presume that this + * will be reused again very quickly. 
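Editor's note: sfe_ipv4_get_connection_match_hash() above XORs the 5-tuple (plus the device pointer) into 32 bits, folds the top half onto the bottom and masks to the table size (shift 12, so 4096 buckets). A user-space sketch of the same arithmetic, with a made-up dev_cookie standing in for the struct net_device pointer (not part of the patch):

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

#define HASH_SHIFT 12
#define HASH_MASK ((1u << HASH_SHIFT) - 1)

static unsigned int conn_match_hash(uint32_t dev_cookie, uint8_t protocol,
				    uint32_t src_ip, uint16_t src_port,
				    uint32_t dest_ip, uint16_t dest_port)
{
	/* Mirrors sfe_ipv4_get_connection_match_hash(): XOR the tuple,
	 * fold the top bits onto the bottom and mask to the table size. */
	uint32_t hash = dev_cookie ^ ntohl(src_ip ^ dest_ip) ^ protocol ^
			ntohs(src_port ^ dest_port);

	return ((hash >> HASH_SHIFT) ^ hash) & HASH_MASK;
}

int main(void)
{
	uint32_t src = inet_addr("192.168.1.10");	/* network byte order */
	uint32_t dst = inet_addr("8.8.8.8");
	unsigned int idx = conn_match_hash(0x1234, 6 /* TCP */,
					   src, htons(40000), dst, htons(53));

	printf("bucket index: %u (of %u)\n", idx, HASH_MASK + 1);
	return 0;
}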
+ */ + head = cm; + do { + cm = cm->next; + } while (cm && (cm->match_src_port != src_port + || cm->match_dest_port != dest_port + || cm->match_src_ip != src_ip + || cm->match_dest_ip != dest_ip + || cm->match_protocol != protocol + || cm->match_dev != dev)); + + /* + * Not found then we're done. + */ + if (unlikely(!cm)) { + return NULL; + } + + /* + * We found a match so move it. + */ + if (cm->next) { + cm->next->prev = cm->prev; + } + cm->prev->next = cm->next; + cm->prev = NULL; + cm->next = head; + head->prev = cm; + si->conn_match_hash[conn_match_idx] = cm; + si->connection_match_hash_reorders++; + + return cm; +} + +/* + * sfe_ipv4_connection_match_update_summary_stats() + * Update the summary stats for a connection match entry. + */ +static inline void sfe_ipv4_connection_match_update_summary_stats(struct sfe_ipv4_connection_match *cm) +{ + cm->rx_packet_count64 += cm->rx_packet_count; + cm->rx_packet_count = 0; + cm->rx_byte_count64 += cm->rx_byte_count; + cm->rx_byte_count = 0; +} + +/* + * sfe_ipv4_connection_match_compute_translations() + * Compute port and address translations for a connection match entry. + */ +static void sfe_ipv4_connection_match_compute_translations(struct sfe_ipv4_connection_match *cm) +{ + /* + * Before we insert the entry look to see if this is tagged as doing address + * translations. If it is then work out the adjustment that we need to apply + * to the transport checksum. + */ + if (cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_SRC) { + /* + * Precompute an incremental checksum adjustment so we can + * edit packets in this stream very quickly. The algorithm is from RFC1624. + */ + u16 src_ip_hi = cm->match_src_ip >> 16; + u16 src_ip_lo = cm->match_src_ip & 0xffff; + u32 xlate_src_ip = ~cm->xlate_src_ip; + u16 xlate_src_ip_hi = xlate_src_ip >> 16; + u16 xlate_src_ip_lo = xlate_src_ip & 0xffff; + u16 xlate_src_port = ~cm->xlate_src_port; + u32 adj; + + /* + * When we compute this fold it down to a 16-bit offset + * as that way we can avoid having to do a double + * folding of the twos-complement result because the + * addition of 2 16-bit values cannot cause a double + * wrap-around! + */ + adj = src_ip_hi + src_ip_lo + cm->match_src_port + + xlate_src_ip_hi + xlate_src_ip_lo + xlate_src_port; + adj = (adj & 0xffff) + (adj >> 16); + adj = (adj & 0xffff) + (adj >> 16); + cm->xlate_src_csum_adjustment = (u16)adj; + + } + + if (cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_DEST) { + /* + * Precompute an incremental checksum adjustment so we can + * edit packets in this stream very quickly. The algorithm is from RFC1624. + */ + u16 dest_ip_hi = cm->match_dest_ip >> 16; + u16 dest_ip_lo = cm->match_dest_ip & 0xffff; + u32 xlate_dest_ip = ~cm->xlate_dest_ip; + u16 xlate_dest_ip_hi = xlate_dest_ip >> 16; + u16 xlate_dest_ip_lo = xlate_dest_ip & 0xffff; + u16 xlate_dest_port = ~cm->xlate_dest_port; + u32 adj; + + /* + * When we compute this fold it down to a 16-bit offset + * as that way we can avoid having to do a double + * folding of the twos-complement result because the + * addition of 2 16-bit values cannot cause a double + * wrap-around! 
+ */ + adj = dest_ip_hi + dest_ip_lo + cm->match_dest_port + + xlate_dest_ip_hi + xlate_dest_ip_lo + xlate_dest_port; + adj = (adj & 0xffff) + (adj >> 16); + adj = (adj & 0xffff) + (adj >> 16); + cm->xlate_dest_csum_adjustment = (u16)adj; + } + + if (cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_SRC) { + u32 adj = ~cm->match_src_ip + cm->xlate_src_ip; + if (adj < cm->xlate_src_ip) { + adj++; + } + + adj = (adj & 0xffff) + (adj >> 16); + adj = (adj & 0xffff) + (adj >> 16); + cm->xlate_src_partial_csum_adjustment = (u16)adj; + } + + if (cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_DEST) { + u32 adj = ~cm->match_dest_ip + cm->xlate_dest_ip; + if (adj < cm->xlate_dest_ip) { + adj++; + } + + adj = (adj & 0xffff) + (adj >> 16); + adj = (adj & 0xffff) + (adj >> 16); + cm->xlate_dest_partial_csum_adjustment = (u16)adj; + } + +} + +/* + * sfe_ipv4_update_summary_stats() + * Update the summary stats. + */ +static void sfe_ipv4_update_summary_stats(struct sfe_ipv4 *si) +{ + int i; + + si->connection_create_requests64 += si->connection_create_requests; + si->connection_create_requests = 0; + si->connection_create_collisions64 += si->connection_create_collisions; + si->connection_create_collisions = 0; + si->connection_destroy_requests64 += si->connection_destroy_requests; + si->connection_destroy_requests = 0; + si->connection_destroy_misses64 += si->connection_destroy_misses; + si->connection_destroy_misses = 0; + si->connection_match_hash_hits64 += si->connection_match_hash_hits; + si->connection_match_hash_hits = 0; + si->connection_match_hash_reorders64 += si->connection_match_hash_reorders; + si->connection_match_hash_reorders = 0; + si->connection_flushes64 += si->connection_flushes; + si->connection_flushes = 0; + si->packets_forwarded64 += si->packets_forwarded; + si->packets_forwarded = 0; + si->packets_not_forwarded64 += si->packets_not_forwarded; + si->packets_not_forwarded = 0; + + for (i = 0; i < SFE_IPV4_EXCEPTION_EVENT_LAST; i++) { + si->exception_events64[i] += si->exception_events[i]; + si->exception_events[i] = 0; + } +} + +/* + * sfe_ipv4_insert_sfe_ipv4_connection_match() + * Insert a connection match into the hash. + * + * On entry we must be holding the lock that protects the hash table. + */ +static inline void sfe_ipv4_insert_sfe_ipv4_connection_match(struct sfe_ipv4 *si, + struct sfe_ipv4_connection_match *cm) +{ + struct sfe_ipv4_connection_match **hash_head; + struct sfe_ipv4_connection_match *prev_head; + unsigned int conn_match_idx + = sfe_ipv4_get_connection_match_hash(cm->match_dev, cm->match_protocol, + cm->match_src_ip, cm->match_src_port, + cm->match_dest_ip, cm->match_dest_port); + + hash_head = &si->conn_match_hash[conn_match_idx]; + prev_head = *hash_head; + cm->prev = NULL; + if (prev_head) { + prev_head->prev = cm; + } + + cm->next = prev_head; + *hash_head = cm; + +#ifdef CONFIG_NF_FLOW_COOKIE + if (!si->flow_cookie_enable) + return; + + /* + * Configure hardware to put a flow cookie in packet of this flow, + * then we can accelerate the lookup process when we received this packet. 
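Editor's note: the xlate_*_csum_adjustment values computed above rely on the RFC 1624 identity - adding the old 16-bit words and the complement of the new words to the stored checksum (with end-around carry) is equivalent to recomputing the checksum after the rewrite. A stand-alone sketch of the compute step above and the apply step used later for UDP forwarding, with made-up words (not part of the patch); the incremental and the full recomputation print the same value:

#include <stdio.h>
#include <stdint.h>

/* Fold a 32-bit accumulator into a 16-bit ones'-complement value. */
static uint16_t csum_fold(uint32_t sum)
{
	sum = (sum & 0xffff) + (sum >> 16);
	sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)sum;
}

int main(void)
{
	/* Pretend these 16-bit words are covered by a transport checksum:
	 * words 0-1 are the source IP halves, word 2 the source port. */
	uint16_t words[6] = { 0xc0a8, 0x010a, 0x9c40, 0x1f90, 0x1234, 0xabcd };
	uint16_t new_ip_hi = 0x0a00, new_ip_lo = 0x0105, new_port = 0xd431;
	uint32_t sum = 0, adj;
	uint16_t csum, csum_inc, csum_full;
	int i;

	for (i = 0; i < 6; i++)
		sum += words[i];
	csum = ~csum_fold(sum);		/* checksum as stored in the packet */

	/* Precompute the adjustment as the code above does: old words plus
	 * the complement of the new words, folded down to 16 bits. */
	adj = words[0] + words[1] + words[2] +
	      (uint16_t)~new_ip_hi + (uint16_t)~new_ip_lo + (uint16_t)~new_port;
	adj = csum_fold(adj);

	/* Apply it the way the UDP fast path does to udph->check. */
	csum_inc = csum_fold(csum + adj);

	/* Recompute from scratch after the rewrite, for comparison. */
	words[0] = new_ip_hi; words[1] = new_ip_lo; words[2] = new_port;
	for (sum = 0, i = 0; i < 6; i++)
		sum += words[i];
	csum_full = ~csum_fold(sum);

	printf("incremental: 0x%04x  full: 0x%04x\n", csum_inc, csum_full);
	return 0;
}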
+ */ + for (conn_match_idx = 1; conn_match_idx < SFE_FLOW_COOKIE_SIZE; conn_match_idx++) { + struct sfe_flow_cookie_entry *entry = &si->sfe_flow_cookie_table[conn_match_idx]; + + if ((NULL == entry->match) && time_is_before_jiffies(entry->last_clean_time + HZ)) { + flow_cookie_set_func_t func; + + rcu_read_lock(); + func = rcu_dereference(si->flow_cookie_set_func); + if (func) { + if (!func(cm->match_protocol, cm->match_src_ip, cm->match_src_port, + cm->match_dest_ip, cm->match_dest_port, conn_match_idx)) { + entry->match = cm; + cm->flow_cookie = conn_match_idx; + } + } + rcu_read_unlock(); + + break; + } + } +#endif +} + +/* + * sfe_ipv4_remove_sfe_ipv4_connection_match() + * Remove a connection match object from the hash. + * + * On entry we must be holding the lock that protects the hash table. + */ +static inline void sfe_ipv4_remove_sfe_ipv4_connection_match(struct sfe_ipv4 *si, struct sfe_ipv4_connection_match *cm) +{ +#ifdef CONFIG_NF_FLOW_COOKIE + if (si->flow_cookie_enable) { + /* + * Tell hardware that we no longer need a flow cookie in packet of this flow + */ + unsigned int conn_match_idx; + + for (conn_match_idx = 1; conn_match_idx < SFE_FLOW_COOKIE_SIZE; conn_match_idx++) { + struct sfe_flow_cookie_entry *entry = &si->sfe_flow_cookie_table[conn_match_idx]; + + if (cm == entry->match) { + flow_cookie_set_func_t func; + + rcu_read_lock(); + func = rcu_dereference(si->flow_cookie_set_func); + if (func) { + func(cm->match_protocol, cm->match_src_ip, cm->match_src_port, + cm->match_dest_ip, cm->match_dest_port, 0); + } + rcu_read_unlock(); + + cm->flow_cookie = 0; + entry->match = NULL; + entry->last_clean_time = jiffies; + break; + } + } + } +#endif + + /* + * Unlink the connection match entry from the hash. + */ + if (cm->prev) { + cm->prev->next = cm->next; + } else { + unsigned int conn_match_idx + = sfe_ipv4_get_connection_match_hash(cm->match_dev, cm->match_protocol, + cm->match_src_ip, cm->match_src_port, + cm->match_dest_ip, cm->match_dest_port); + si->conn_match_hash[conn_match_idx] = cm->next; + } + + if (cm->next) { + cm->next->prev = cm->prev; + } + + /* + * If the connection match entry is in the active list remove it. + */ + if (cm->active) { + if (likely(cm->active_prev)) { + cm->active_prev->active_next = cm->active_next; + } else { + si->active_head = cm->active_next; + } + + if (likely(cm->active_next)) { + cm->active_next->active_prev = cm->active_prev; + } else { + si->active_tail = cm->active_prev; + } + } +} + +/* + * sfe_ipv4_get_connection_hash() + * Generate the hash used in connection lookups. + */ +static inline unsigned int sfe_ipv4_get_connection_hash(u8 protocol, __be32 src_ip, __be16 src_port, + __be32 dest_ip, __be16 dest_port) +{ + u32 hash = ntohl(src_ip ^ dest_ip) ^ protocol ^ ntohs(src_port ^ dest_port); + return ((hash >> SFE_IPV4_CONNECTION_HASH_SHIFT) ^ hash) & SFE_IPV4_CONNECTION_HASH_MASK; +} + +/* + * sfe_ipv4_find_sfe_ipv4_connection() + * Get the IPv4 connection info that corresponds to a particular 5-tuple. + * + * On entry we must be holding the lock that protects the hash table. + */ +static inline struct sfe_ipv4_connection *sfe_ipv4_find_sfe_ipv4_connection(struct sfe_ipv4 *si, u32 protocol, + __be32 src_ip, __be16 src_port, + __be32 dest_ip, __be16 dest_port) +{ + struct sfe_ipv4_connection *c; + unsigned int conn_idx = sfe_ipv4_get_connection_hash(protocol, src_ip, src_port, dest_ip, dest_port); + c = si->conn_hash[conn_idx]; + + /* + * If we don't have anything in this chain then bale. 
+ */ + if (unlikely(!c)) { + return NULL; + } + + /* + * Hopefully the first entry is the one we want. + */ + if ((c->src_port == src_port) + && (c->dest_port == dest_port) + && (c->src_ip == src_ip) + && (c->dest_ip == dest_ip) + && (c->protocol == protocol)) { + return c; + } + + /* + * Unfortunately we didn't find it at head, so we search it in chain. + */ + do { + c = c->next; + } while (c && (c->src_port != src_port + || c->dest_port != dest_port + || c->src_ip != src_ip + || c->dest_ip != dest_ip + || c->protocol != protocol)); + + /* + * Will need connection entry for next create/destroy metadata, + * So no need to re-order entry for these requests + */ + return c; +} + +/* + * sfe_ipv4_mark_rule() + * Updates the mark for a current offloaded connection + * + * Will take hash lock upon entry + */ +void sfe_ipv4_mark_rule(struct sfe_connection_mark *mark) +{ + struct sfe_ipv4 *si = &__si; + struct sfe_ipv4_connection *c; + + spin_lock_bh(&si->lock); + c = sfe_ipv4_find_sfe_ipv4_connection(si, mark->protocol, + mark->src_ip.ip, mark->src_port, + mark->dest_ip.ip, mark->dest_port); + if (c) { + WARN_ON((0 != c->mark) && (0 == mark->mark)); + c->mark = mark->mark; + } + spin_unlock_bh(&si->lock); + + if (c) { + DEBUG_TRACE("Matching connection found for mark, " + "setting from %08x to %08x\n", + c->mark, mark->mark); + } +} + +/* + * sfe_ipv4_insert_sfe_ipv4_connection() + * Insert a connection into the hash. + * + * On entry we must be holding the lock that protects the hash table. + */ +static void sfe_ipv4_insert_sfe_ipv4_connection(struct sfe_ipv4 *si, struct sfe_ipv4_connection *c) +{ + struct sfe_ipv4_connection **hash_head; + struct sfe_ipv4_connection *prev_head; + unsigned int conn_idx; + + /* + * Insert entry into the connection hash. + */ + conn_idx = sfe_ipv4_get_connection_hash(c->protocol, c->src_ip, c->src_port, + c->dest_ip, c->dest_port); + hash_head = &si->conn_hash[conn_idx]; + prev_head = *hash_head; + c->prev = NULL; + if (prev_head) { + prev_head->prev = c; + } + + c->next = prev_head; + *hash_head = c; + + /* + * Insert entry into the "all connections" list. + */ + if (si->all_connections_tail) { + c->all_connections_prev = si->all_connections_tail; + si->all_connections_tail->all_connections_next = c; + } else { + c->all_connections_prev = NULL; + si->all_connections_head = c; + } + + si->all_connections_tail = c; + c->all_connections_next = NULL; + si->num_connections++; + + /* + * Insert the connection match objects too. + */ + sfe_ipv4_insert_sfe_ipv4_connection_match(si, c->original_match); + sfe_ipv4_insert_sfe_ipv4_connection_match(si, c->reply_match); +} + +/* + * sfe_ipv4_remove_sfe_ipv4_connection() + * Remove a sfe_ipv4_connection object from the hash. + * + * On entry we must be holding the lock that protects the hash table. + */ +static void sfe_ipv4_remove_sfe_ipv4_connection(struct sfe_ipv4 *si, struct sfe_ipv4_connection *c) +{ + /* + * Remove the connection match objects. + */ + sfe_ipv4_remove_sfe_ipv4_connection_match(si, c->reply_match); + sfe_ipv4_remove_sfe_ipv4_connection_match(si, c->original_match); + + /* + * Unlink the connection. 
+ */ + if (c->prev) { + c->prev->next = c->next; + } else { + unsigned int conn_idx = sfe_ipv4_get_connection_hash(c->protocol, c->src_ip, c->src_port, + c->dest_ip, c->dest_port); + si->conn_hash[conn_idx] = c->next; + } + + if (c->next) { + c->next->prev = c->prev; + } + + /* + * Unlink connection from all_connections list + */ + if (c->all_connections_prev) { + c->all_connections_prev->all_connections_next = c->all_connections_next; + } else { + si->all_connections_head = c->all_connections_next; + } + + if (c->all_connections_next) { + c->all_connections_next->all_connections_prev = c->all_connections_prev; + } else { + si->all_connections_tail = c->all_connections_prev; + } + + si->num_connections--; +} + +/* + * sfe_ipv4_sync_sfe_ipv4_connection() + * Sync a connection. + * + * On entry to this function we expect that the lock for the connection is either + * already held or isn't required. + */ +static void sfe_ipv4_gen_sync_sfe_ipv4_connection(struct sfe_ipv4 *si, struct sfe_ipv4_connection *c, + struct sfe_connection_sync *sis, sfe_sync_reason_t reason, + u64 now_jiffies) +{ + struct sfe_ipv4_connection_match *original_cm; + struct sfe_ipv4_connection_match *reply_cm; + + /* + * Fill in the update message. + */ + sis->is_v6 = 0; + sis->protocol = c->protocol; + sis->src_ip.ip = c->src_ip; + sis->src_ip_xlate.ip = c->src_ip_xlate; + sis->dest_ip.ip = c->dest_ip; + sis->dest_ip_xlate.ip = c->dest_ip_xlate; + sis->src_port = c->src_port; + sis->src_port_xlate = c->src_port_xlate; + sis->dest_port = c->dest_port; + sis->dest_port_xlate = c->dest_port_xlate; + + original_cm = c->original_match; + reply_cm = c->reply_match; + sis->src_td_max_window = original_cm->protocol_state.tcp.max_win; + sis->src_td_end = original_cm->protocol_state.tcp.end; + sis->src_td_max_end = original_cm->protocol_state.tcp.max_end; + sis->dest_td_max_window = reply_cm->protocol_state.tcp.max_win; + sis->dest_td_end = reply_cm->protocol_state.tcp.end; + sis->dest_td_max_end = reply_cm->protocol_state.tcp.max_end; + + sis->src_new_packet_count = original_cm->rx_packet_count; + sis->src_new_byte_count = original_cm->rx_byte_count; + sis->dest_new_packet_count = reply_cm->rx_packet_count; + sis->dest_new_byte_count = reply_cm->rx_byte_count; + + sfe_ipv4_connection_match_update_summary_stats(original_cm); + sfe_ipv4_connection_match_update_summary_stats(reply_cm); + + sis->src_dev = original_cm->match_dev; + sis->src_packet_count = original_cm->rx_packet_count64; + sis->src_byte_count = original_cm->rx_byte_count64; + + sis->dest_dev = reply_cm->match_dev; + sis->dest_packet_count = reply_cm->rx_packet_count64; + sis->dest_byte_count = reply_cm->rx_byte_count64; + + sis->reason = reason; + + /* + * Get the time increment since our last sync. + */ + sis->delta_jiffies = now_jiffies - c->last_sync_jiffies; + c->last_sync_jiffies = now_jiffies; +} + +/* + * sfe_ipv4_flush_sfe_ipv4_connection() + * Flush a connection and free all associated resources. + * + * We need to be called with bottom halves disabled locally as we need to acquire + * the connection hash lock and release it again. In general we're actually called + * from within a BH and so we're fine, but we're also called when connections are + * torn down. 
+ */ +static void sfe_ipv4_flush_sfe_ipv4_connection(struct sfe_ipv4 *si, + struct sfe_ipv4_connection *c, + sfe_sync_reason_t reason) +{ + struct sfe_connection_sync sis; + u64 now_jiffies; + sfe_sync_rule_callback_t sync_rule_callback; + + rcu_read_lock(); + spin_lock_bh(&si->lock); + si->connection_flushes++; + sync_rule_callback = rcu_dereference(si->sync_rule_callback); + spin_unlock_bh(&si->lock); + + if (sync_rule_callback) { + /* + * Generate a sync message and then sync. + */ + now_jiffies = get_jiffies_64(); + sfe_ipv4_gen_sync_sfe_ipv4_connection(si, c, &sis, reason, now_jiffies); + sync_rule_callback(&sis); + } + + rcu_read_unlock(); + + /* + * Release our hold of the source and dest devices and free the memory + * for our connection objects. + */ + dev_put(c->original_dev); + dev_put(c->reply_dev); + kfree(c->original_match); + kfree(c->reply_match); + kfree(c); +} + +/* + * sfe_ipv4_recv_udp() + * Handle UDP packet receives and forwarding. + */ +static int sfe_ipv4_recv_udp(struct sfe_ipv4 *si, struct sk_buff *skb, struct net_device *dev, + unsigned int len, struct sfe_ipv4_ip_hdr *iph, unsigned int ihl, bool flush_on_find) +{ + struct sfe_ipv4_udp_hdr *udph; + __be32 src_ip; + __be32 dest_ip; + __be16 src_port; + __be16 dest_port; + struct sfe_ipv4_connection_match *cm; + u8 ttl; + struct net_device *xmit_dev; + + /* + * Is our packet too short to contain a valid UDP header? + */ + if (unlikely(!pskb_may_pull(skb, (sizeof(struct sfe_ipv4_udp_hdr) + ihl)))) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_UDP_HEADER_INCOMPLETE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("packet too short for UDP header\n"); + return 0; + } + + /* + * Read the IP address and port information. Read the IP header data first + * because we've almost certainly got that in the cache. We may not yet have + * the UDP header cached though so allow more time for any prefetching. + */ + src_ip = iph->saddr; + dest_ip = iph->daddr; + + udph = (struct sfe_ipv4_udp_hdr *)(skb->data + ihl); + src_port = udph->source; + dest_port = udph->dest; + + spin_lock_bh(&si->lock); + + /* + * Look for a connection match. + */ +#ifdef CONFIG_NF_FLOW_COOKIE + cm = si->sfe_flow_cookie_table[skb->flow_cookie & SFE_FLOW_COOKIE_MASK].match; + if (unlikely(!cm)) { + cm = sfe_ipv4_find_sfe_ipv4_connection_match(si, dev, IPPROTO_UDP, src_ip, src_port, dest_ip, dest_port); + } +#else + cm = sfe_ipv4_find_sfe_ipv4_connection_match(si, dev, IPPROTO_UDP, src_ip, src_port, dest_ip, dest_port); +#endif + if (unlikely(!cm)) { + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_UDP_NO_CONNECTION]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("no connection found\n"); + return 0; + } + + /* + * If our packet has beern marked as "flush on find" we can't actually + * forward it in the fast path, but now that we've found an associated + * connection we can flush that out before we process the packet. + */ + if (unlikely(flush_on_find)) { + struct sfe_ipv4_connection *c = cm->connection; + sfe_ipv4_remove_sfe_ipv4_connection(si, c); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_UDP_IP_OPTIONS_OR_INITIAL_FRAGMENT]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("flush on find\n"); + sfe_ipv4_flush_sfe_ipv4_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + +#ifdef CONFIG_XFRM + /* + * We can't accelerate the flow on this direction, just let it go + * through the slow path. 
+ */ + if (unlikely(!cm->flow_accel)) { + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + return 0; + } +#endif + + /* + * Does our TTL allow forwarding? + */ + ttl = iph->ttl; + if (unlikely(ttl < 2)) { + struct sfe_ipv4_connection *c = cm->connection; + sfe_ipv4_remove_sfe_ipv4_connection(si, c); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_UDP_SMALL_TTL]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("ttl too low\n"); + sfe_ipv4_flush_sfe_ipv4_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + /* + * If our packet is larger than the MTU of the transmit interface then + * we can't forward it easily. + */ + if (unlikely(len > cm->xmit_dev_mtu)) { + struct sfe_ipv4_connection *c = cm->connection; + sfe_ipv4_remove_sfe_ipv4_connection(si, c); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_UDP_NEEDS_FRAGMENTATION]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("larger than mtu\n"); + sfe_ipv4_flush_sfe_ipv4_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + /* + * From this point on we're good to modify the packet. + */ + + /* + * Check if skb was cloned. If it was, unshare it. Because + * the data area is going to be written in this path and we don't want to + * change the cloned skb's data section. + */ + if (unlikely(skb_cloned(skb))) { + DEBUG_TRACE("%px: skb is a cloned skb\n", skb); + skb = skb_unshare(skb, GFP_ATOMIC); + if (!skb) { + DEBUG_WARN("Failed to unshare the cloned skb\n"); + return 0; + } + + /* + * Update the iph and udph pointers with the unshared skb's data area. + */ + iph = (struct sfe_ipv4_ip_hdr *)skb->data; + udph = (struct sfe_ipv4_udp_hdr *)(skb->data + ihl); + } + + /* + * Update DSCP + */ + if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_DSCP_REMARK)) { + iph->tos = (iph->tos & SFE_IPV4_DSCP_MASK) | cm->dscp; + } + + /* + * Decrement our TTL. + */ + iph->ttl = ttl - 1; + + /* + * Do we have to perform translations of the source address/port? + */ + if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_SRC)) { + u16 udp_csum; + + iph->saddr = cm->xlate_src_ip; + udph->source = cm->xlate_src_port; + + /* + * Do we have a non-zero UDP checksum? If we do then we need + * to update it. + */ + udp_csum = udph->check; + if (likely(udp_csum)) { + u32 sum; + + if (unlikely(skb->ip_summed == CHECKSUM_PARTIAL)) { + sum = udp_csum + cm->xlate_src_partial_csum_adjustment; + } else { + sum = udp_csum + cm->xlate_src_csum_adjustment; + } + + sum = (sum & 0xffff) + (sum >> 16); + udph->check = (u16)sum; + } + } + + /* + * Do we have to perform translations of the destination address/port? + */ + if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_DEST)) { + u16 udp_csum; + + iph->daddr = cm->xlate_dest_ip; + udph->dest = cm->xlate_dest_port; + + /* + * Do we have a non-zero UDP checksum? If we do then we need + * to update it. + */ + udp_csum = udph->check; + if (likely(udp_csum)) { + u32 sum; + + if (unlikely(skb->ip_summed == CHECKSUM_PARTIAL)) { + sum = udp_csum + cm->xlate_dest_partial_csum_adjustment; + } else { + sum = udp_csum + cm->xlate_dest_csum_adjustment; + } + + sum = (sum & 0xffff) + (sum >> 16); + udph->check = (u16)sum; + } + } + + /* + * Replace the IP checksum. + */ + iph->check = sfe_ipv4_gen_ip_csum(iph); + + /* + * Update traffic stats. + */ + cm->rx_packet_count++; + cm->rx_byte_count += len; + + /* + * If we're not already on the active list then insert ourselves at the tail + * of the current list. 
+ */ + if (unlikely(!cm->active)) { + cm->active = true; + cm->active_prev = si->active_tail; + if (likely(si->active_tail)) { + si->active_tail->active_next = cm; + } else { + si->active_head = cm; + } + si->active_tail = cm; + } + + xmit_dev = cm->xmit_dev; + skb->dev = xmit_dev; + + /* + * Check to see if we need to write a header. + */ + if (likely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_WRITE_L2_HDR)) { + if (unlikely(!(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_WRITE_FAST_ETH_HDR))) { + dev_hard_header(skb, xmit_dev, ETH_P_IP, + cm->xmit_dest_mac, cm->xmit_src_mac, len); + } else { + /* + * For the simple case we write this really fast. + */ + struct sfe_ipv4_eth_hdr *eth = (struct sfe_ipv4_eth_hdr *)__skb_push(skb, ETH_HLEN); + eth->h_proto = htons(ETH_P_IP); + eth->h_dest[0] = cm->xmit_dest_mac[0]; + eth->h_dest[1] = cm->xmit_dest_mac[1]; + eth->h_dest[2] = cm->xmit_dest_mac[2]; + eth->h_source[0] = cm->xmit_src_mac[0]; + eth->h_source[1] = cm->xmit_src_mac[1]; + eth->h_source[2] = cm->xmit_src_mac[2]; + } + } + + /* + * Update priority of skb. + */ + if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_PRIORITY_REMARK)) { + skb->priority = cm->priority; + } + + /* + * Mark outgoing packet. + */ + skb->mark = cm->connection->mark; + if (skb->mark) { + DEBUG_TRACE("SKB MARK is NON ZERO %x\n", skb->mark); + } + + si->packets_forwarded++; + spin_unlock_bh(&si->lock); + + /* + * We're going to check for GSO flags when we transmit the packet so + * start fetching the necessary cache line now. + */ + prefetch(skb_shinfo(skb)); + + /* + * Mark that this packet has been fast forwarded. + */ + skb->fast_forwarded = 1; + + /* + * Send the packet on its way. + */ + dev_queue_xmit(skb); + + return 1; +} + +/* + * sfe_ipv4_process_tcp_option_sack() + * Parse TCP SACK option and update ack according + */ +static bool sfe_ipv4_process_tcp_option_sack(const struct sfe_ipv4_tcp_hdr *th, const u32 data_offs, + u32 *ack) +{ + u32 length = sizeof(struct sfe_ipv4_tcp_hdr); + u8 *ptr = (u8 *)th + length; + + /* + * Ignore processing if TCP packet has only TIMESTAMP option. + */ + if (likely(data_offs == length + TCPOLEN_TIMESTAMP + 1 + 1) + && likely(ptr[0] == TCPOPT_NOP) + && likely(ptr[1] == TCPOPT_NOP) + && likely(ptr[2] == TCPOPT_TIMESTAMP) + && likely(ptr[3] == TCPOLEN_TIMESTAMP)) { + return true; + } + + /* + * TCP options. Parse SACK option. + */ + while (length < data_offs) { + u8 size; + u8 kind; + + ptr = (u8 *)th + length; + kind = *ptr; + + /* + * NOP, for padding + * Not in the switch because to fast escape and to not calculate size + */ + if (kind == TCPOPT_NOP) { + length++; + continue; + } + + if (kind == TCPOPT_SACK) { + u32 sack = 0; + u8 re = 1 + 1; + + size = *(ptr + 1); + if ((size < (1 + 1 + TCPOLEN_SACK_PERBLOCK)) + || ((size - (1 + 1)) % (TCPOLEN_SACK_PERBLOCK)) + || (size > (data_offs - length))) { + return false; + } + + re += 4; + while (re < size) { + u32 sack_re; + u8 *sptr = ptr + re; + sack_re = (sptr[0] << 24) | (sptr[1] << 16) | (sptr[2] << 8) | sptr[3]; + if (sack_re > sack) { + sack = sack_re; + } + re += TCPOLEN_SACK_PERBLOCK; + } + if (sack > *ack) { + *ack = sack; + } + length += size; + continue; + } + if (kind == TCPOPT_EOL) { + return true; + } + size = *(ptr + 1); + if (size < 2) { + return false; + } + length += size; + } + + return true; +} + +/* + * sfe_ipv4_recv_tcp() + * Handle TCP packet receives and forwarding. 
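+ * In addition to the checks made for UDP (connection lookup, TTL, MTU), the
+ * TCP fast path only forwards plain ACK segments and, unless sequence
+ * checking is disabled for the connection, validates sequence and ACK numbers
+ * against the window state tracked for both directions.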
+ */ +static int sfe_ipv4_recv_tcp(struct sfe_ipv4 *si, struct sk_buff *skb, struct net_device *dev, + unsigned int len, struct sfe_ipv4_ip_hdr *iph, unsigned int ihl, bool flush_on_find) +{ + struct sfe_ipv4_tcp_hdr *tcph; + __be32 src_ip; + __be32 dest_ip; + __be16 src_port; + __be16 dest_port; + struct sfe_ipv4_connection_match *cm; + struct sfe_ipv4_connection_match *counter_cm; + u8 ttl; + u32 flags; + struct net_device *xmit_dev; + + /* + * Is our packet too short to contain a valid UDP header? + */ + if (unlikely(!pskb_may_pull(skb, (sizeof(struct sfe_ipv4_tcp_hdr) + ihl)))) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_HEADER_INCOMPLETE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("packet too short for TCP header\n"); + return 0; + } + + /* + * Read the IP address and port information. Read the IP header data first + * because we've almost certainly got that in the cache. We may not yet have + * the TCP header cached though so allow more time for any prefetching. + */ + src_ip = iph->saddr; + dest_ip = iph->daddr; + + tcph = (struct sfe_ipv4_tcp_hdr *)(skb->data + ihl); + src_port = tcph->source; + dest_port = tcph->dest; + flags = tcp_flag_word(tcph); + + spin_lock_bh(&si->lock); + + /* + * Look for a connection match. + */ +#ifdef CONFIG_NF_FLOW_COOKIE + cm = si->sfe_flow_cookie_table[skb->flow_cookie & SFE_FLOW_COOKIE_MASK].match; + if (unlikely(!cm)) { + cm = sfe_ipv4_find_sfe_ipv4_connection_match(si, dev, IPPROTO_TCP, src_ip, src_port, dest_ip, dest_port); + } +#else + cm = sfe_ipv4_find_sfe_ipv4_connection_match(si, dev, IPPROTO_TCP, src_ip, src_port, dest_ip, dest_port); +#endif + if (unlikely(!cm)) { + /* + * We didn't get a connection but as TCP is connection-oriented that + * may be because this is a non-fast connection (not running established). + * For diagnostic purposes we differentiate this here. + */ + if (likely((flags & (TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_FIN | TCP_FLAG_ACK)) == TCP_FLAG_ACK)) { + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_NO_CONNECTION_FAST_FLAGS]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("no connection found - fast flags\n"); + return 0; + } + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_NO_CONNECTION_SLOW_FLAGS]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("no connection found - slow flags: 0x%x\n", + flags & (TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_FIN | TCP_FLAG_ACK)); + return 0; + } + + /* + * If our packet has beern marked as "flush on find" we can't actually + * forward it in the fast path, but now that we've found an associated + * connection we can flush that out before we process the packet. + */ + if (unlikely(flush_on_find)) { + struct sfe_ipv4_connection *c = cm->connection; + sfe_ipv4_remove_sfe_ipv4_connection(si, c); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_IP_OPTIONS_OR_INITIAL_FRAGMENT]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("flush on find\n"); + sfe_ipv4_flush_sfe_ipv4_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + +#ifdef CONFIG_XFRM + /* + * We can't accelerate the flow on this direction, just let it go + * through the slow path. + */ + if (unlikely(!cm->flow_accel)) { + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + return 0; + } +#endif + /* + * Does our TTL allow forwarding? 
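+ * Packets whose TTL would drop below 1 are handed back to the Linux stack so
+ * that it can generate the appropriate ICMP Time Exceeded response.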
+ */ + ttl = iph->ttl; + if (unlikely(ttl < 2)) { + struct sfe_ipv4_connection *c = cm->connection; + sfe_ipv4_remove_sfe_ipv4_connection(si, c); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_SMALL_TTL]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("ttl too low\n"); + sfe_ipv4_flush_sfe_ipv4_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + /* + * If our packet is larger than the MTU of the transmit interface then + * we can't forward it easily. + */ + if (unlikely((len > cm->xmit_dev_mtu) && !skb_is_gso(skb))) { + struct sfe_ipv4_connection *c = cm->connection; + sfe_ipv4_remove_sfe_ipv4_connection(si, c); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_NEEDS_FRAGMENTATION]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("larger than mtu\n"); + sfe_ipv4_flush_sfe_ipv4_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + /* + * Look at our TCP flags. Anything missing an ACK or that has RST, SYN or FIN + * set is not a fast path packet. + */ + if (unlikely((flags & (TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_FIN | TCP_FLAG_ACK)) != TCP_FLAG_ACK)) { + struct sfe_ipv4_connection *c = cm->connection; + sfe_ipv4_remove_sfe_ipv4_connection(si, c); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_FLAGS]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("TCP flags: 0x%x are not fast\n", + flags & (TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_FIN | TCP_FLAG_ACK)); + sfe_ipv4_flush_sfe_ipv4_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + counter_cm = cm->counter_match; + + /* + * Are we doing sequence number checking? + */ + if (likely(!(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK))) { + u32 seq; + u32 ack; + u32 sack; + u32 data_offs; + u32 end; + u32 left_edge; + u32 scaled_win; + u32 max_end; + + /* + * Is our sequence fully past the right hand edge of the window? + */ + seq = ntohl(tcph->seq); + if (unlikely((s32)(seq - (cm->protocol_state.tcp.max_end + 1)) > 0)) { + struct sfe_ipv4_connection *c = cm->connection; + sfe_ipv4_remove_sfe_ipv4_connection(si, c); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_SEQ_EXCEEDS_RIGHT_EDGE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("seq: %u exceeds right edge: %u\n", + seq, cm->protocol_state.tcp.max_end + 1); + sfe_ipv4_flush_sfe_ipv4_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + /* + * Check that our TCP data offset isn't too short. + */ + data_offs = tcph->doff << 2; + if (unlikely(data_offs < sizeof(struct sfe_ipv4_tcp_hdr))) { + struct sfe_ipv4_connection *c = cm->connection; + sfe_ipv4_remove_sfe_ipv4_connection(si, c); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_SMALL_DATA_OFFS]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("TCP data offset: %u, too small\n", data_offs); + sfe_ipv4_flush_sfe_ipv4_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + /* + * Update ACK according to any SACK option. 
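+ * When a SACK option is present we use the highest right-hand edge of the
+ * SACK blocks in place of the plain ACK number for the window checks below,
+ * since that is the furthest point the receiver has acknowledged.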
+ */ + ack = ntohl(tcph->ack_seq); + sack = ack; + if (unlikely(!sfe_ipv4_process_tcp_option_sack(tcph, data_offs, &sack))) { + struct sfe_ipv4_connection *c = cm->connection; + sfe_ipv4_remove_sfe_ipv4_connection(si, c); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_BAD_SACK]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("TCP option SACK size is wrong\n"); + sfe_ipv4_flush_sfe_ipv4_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + /* + * Check that our TCP data offset isn't past the end of the packet. + */ + data_offs += sizeof(struct sfe_ipv4_ip_hdr); + if (unlikely(len < data_offs)) { + struct sfe_ipv4_connection *c = cm->connection; + sfe_ipv4_remove_sfe_ipv4_connection(si, c); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_BIG_DATA_OFFS]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("TCP data offset: %u, past end of packet: %u\n", + data_offs, len); + sfe_ipv4_flush_sfe_ipv4_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + end = seq + len - data_offs; + + /* + * Is our sequence fully before the left hand edge of the window? + */ + if (unlikely((s32)(end - (cm->protocol_state.tcp.end + - counter_cm->protocol_state.tcp.max_win - 1)) < 0)) { + struct sfe_ipv4_connection *c = cm->connection; + sfe_ipv4_remove_sfe_ipv4_connection(si, c); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_SEQ_BEFORE_LEFT_EDGE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("seq: %u before left edge: %u\n", + end, cm->protocol_state.tcp.end - counter_cm->protocol_state.tcp.max_win - 1); + sfe_ipv4_flush_sfe_ipv4_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + /* + * Are we acking data that is to the right of what has been sent? + */ + if (unlikely((s32)(sack - (counter_cm->protocol_state.tcp.end + 1)) > 0)) { + struct sfe_ipv4_connection *c = cm->connection; + sfe_ipv4_remove_sfe_ipv4_connection(si, c); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_ACK_EXCEEDS_RIGHT_EDGE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("ack: %u exceeds right edge: %u\n", + sack, counter_cm->protocol_state.tcp.end + 1); + sfe_ipv4_flush_sfe_ipv4_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + /* + * Is our ack too far before the left hand edge of the window? + */ + left_edge = counter_cm->protocol_state.tcp.end + - cm->protocol_state.tcp.max_win + - SFE_IPV4_TCP_MAX_ACK_WINDOW + - 1; + if (unlikely((s32)(sack - left_edge) < 0)) { + struct sfe_ipv4_connection *c = cm->connection; + sfe_ipv4_remove_sfe_ipv4_connection(si, c); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_ACK_BEFORE_LEFT_EDGE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("ack: %u before left edge: %u\n", sack, left_edge); + sfe_ipv4_flush_sfe_ipv4_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + /* + * Have we just seen the largest window size yet for this connection? If yes + * then we need to record the new value. + */ + scaled_win = ntohs(tcph->window) << cm->protocol_state.tcp.win_scale; + scaled_win += (sack - ack); + if (unlikely(cm->protocol_state.tcp.max_win < scaled_win)) { + cm->protocol_state.tcp.max_win = scaled_win; + } + + /* + * If our sequence and/or ack numbers have advanced then record the new state. 
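+ * "end" tracks the highest sequence number this side has sent (seq plus
+ * payload length), while "max_end" on the counter match records the
+ * right-hand edge of the window this side has advertised (ack plus the scaled
+ * window), bounding the sequence numbers the peer may send; both values only
+ * ever move forward.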
+ */ + if (likely((s32)(end - cm->protocol_state.tcp.end) >= 0)) { + cm->protocol_state.tcp.end = end; + } + + max_end = sack + scaled_win; + if (likely((s32)(max_end - counter_cm->protocol_state.tcp.max_end) >= 0)) { + counter_cm->protocol_state.tcp.max_end = max_end; + } + } + + /* + * From this point on we're good to modify the packet. + */ + + /* + * Check if skb was cloned. If it was, unshare it. Because + * the data area is going to be written in this path and we don't want to + * change the cloned skb's data section. + */ + if (unlikely(skb_cloned(skb))) { + DEBUG_TRACE("%px: skb is a cloned skb\n", skb); + skb = skb_unshare(skb, GFP_ATOMIC); + if (!skb) { + DEBUG_WARN("Failed to unshare the cloned skb\n"); + return 0; + } + + /* + * Update the iph and tcph pointers with the unshared skb's data area. + */ + iph = (struct sfe_ipv4_ip_hdr *)skb->data; + tcph = (struct sfe_ipv4_tcp_hdr *)(skb->data + ihl); + } + + /* + * Update DSCP + */ + if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_DSCP_REMARK)) { + iph->tos = (iph->tos & SFE_IPV4_DSCP_MASK) | cm->dscp; + } + + /* + * Decrement our TTL. + */ + iph->ttl = ttl - 1; + + /* + * Do we have to perform translations of the source address/port? + */ + if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_SRC)) { + u16 tcp_csum; + u32 sum; + + iph->saddr = cm->xlate_src_ip; + tcph->source = cm->xlate_src_port; + + /* + * Do we have a non-zero UDP checksum? If we do then we need + * to update it. + */ + tcp_csum = tcph->check; + if (unlikely(skb->ip_summed == CHECKSUM_PARTIAL)) { + sum = tcp_csum + cm->xlate_src_partial_csum_adjustment; + } else { + sum = tcp_csum + cm->xlate_src_csum_adjustment; + } + + sum = (sum & 0xffff) + (sum >> 16); + tcph->check = (u16)sum; + } + + /* + * Do we have to perform translations of the destination address/port? + */ + if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_DEST)) { + u16 tcp_csum; + u32 sum; + + iph->daddr = cm->xlate_dest_ip; + tcph->dest = cm->xlate_dest_port; + + /* + * Do we have a non-zero UDP checksum? If we do then we need + * to update it. + */ + tcp_csum = tcph->check; + if (unlikely(skb->ip_summed == CHECKSUM_PARTIAL)) { + sum = tcp_csum + cm->xlate_dest_partial_csum_adjustment; + } else { + sum = tcp_csum + cm->xlate_dest_csum_adjustment; + } + + sum = (sum & 0xffff) + (sum >> 16); + tcph->check = (u16)sum; + } + + /* + * Replace the IP checksum. + */ + iph->check = sfe_ipv4_gen_ip_csum(iph); + + /* + * Update traffic stats. + */ + cm->rx_packet_count++; + cm->rx_byte_count += len; + + /* + * If we're not already on the active list then insert ourselves at the tail + * of the current list. + */ + if (unlikely(!cm->active)) { + cm->active = true; + cm->active_prev = si->active_tail; + if (likely(si->active_tail)) { + si->active_tail->active_next = cm; + } else { + si->active_head = cm; + } + si->active_tail = cm; + } + + xmit_dev = cm->xmit_dev; + skb->dev = xmit_dev; + + /* + * Check to see if we need to write a header. + */ + if (likely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_WRITE_L2_HDR)) { + if (unlikely(!(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_WRITE_FAST_ETH_HDR))) { + dev_hard_header(skb, xmit_dev, ETH_P_IP, + cm->xmit_dest_mac, cm->xmit_src_mac, len); + } else { + /* + * For the simple case we write this really fast. 
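+ * The cached destination and source MAC addresses are stored as three 16-bit
+ * words each, so the Ethernet header can be written with six half-word stores
+ * instead of going through dev_hard_header().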
+ */ + struct sfe_ipv4_eth_hdr *eth = (struct sfe_ipv4_eth_hdr *)__skb_push(skb, ETH_HLEN); + eth->h_proto = htons(ETH_P_IP); + eth->h_dest[0] = cm->xmit_dest_mac[0]; + eth->h_dest[1] = cm->xmit_dest_mac[1]; + eth->h_dest[2] = cm->xmit_dest_mac[2]; + eth->h_source[0] = cm->xmit_src_mac[0]; + eth->h_source[1] = cm->xmit_src_mac[1]; + eth->h_source[2] = cm->xmit_src_mac[2]; + } + } + + /* + * Update priority of skb. + */ + if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_PRIORITY_REMARK)) { + skb->priority = cm->priority; + } + + /* + * Mark outgoing packet + */ + skb->mark = cm->connection->mark; + if (skb->mark) { + DEBUG_TRACE("SKB MARK is NON ZERO %x\n", skb->mark); + } + + si->packets_forwarded++; + spin_unlock_bh(&si->lock); + + /* + * We're going to check for GSO flags when we transmit the packet so + * start fetching the necessary cache line now. + */ + prefetch(skb_shinfo(skb)); + + /* + * Mark that this packet has been fast forwarded. + */ + skb->fast_forwarded = 1; + + /* + * Send the packet on its way. + */ + dev_queue_xmit(skb); + + return 1; +} + +/* + * sfe_ipv4_recv_icmp() + * Handle ICMP packet receives. + * + * ICMP packets aren't handled as a "fast path" and always have us process them + * through the default Linux stack. What we do need to do is look for any errors + * about connections we are handling in the fast path. If we find any such + * connections then we want to flush their state so that the ICMP error path + * within Linux has all of the correct state should it need it. + */ +static int sfe_ipv4_recv_icmp(struct sfe_ipv4 *si, struct sk_buff *skb, struct net_device *dev, + unsigned int len, struct sfe_ipv4_ip_hdr *iph, unsigned int ihl) +{ + struct icmphdr *icmph; + struct sfe_ipv4_ip_hdr *icmp_iph; + unsigned int icmp_ihl_words; + unsigned int icmp_ihl; + u32 *icmp_trans_h; + struct sfe_ipv4_udp_hdr *icmp_udph; + struct sfe_ipv4_tcp_hdr *icmp_tcph; + __be32 src_ip; + __be32 dest_ip; + __be16 src_port; + __be16 dest_port; + struct sfe_ipv4_connection_match *cm; + struct sfe_ipv4_connection *c; + u32 pull_len = sizeof(struct icmphdr) + ihl; + + /* + * Is our packet too short to contain a valid ICMP header? + */ + len -= ihl; + if (!pskb_may_pull(skb, pull_len)) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_ICMP_HEADER_INCOMPLETE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("packet too short for ICMP header\n"); + return 0; + } + + /* + * We only handle "destination unreachable" and "time exceeded" messages. + */ + icmph = (struct icmphdr *)(skb->data + ihl); + if ((icmph->type != ICMP_DEST_UNREACH) + && (icmph->type != ICMP_TIME_EXCEEDED)) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_ICMP_UNHANDLED_TYPE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("unhandled ICMP type: 0x%x\n", icmph->type); + return 0; + } + + /* + * Do we have the full embedded IP header? + */ + len -= sizeof(struct icmphdr); + pull_len += sizeof(struct sfe_ipv4_ip_hdr); + if (!pskb_may_pull(skb, pull_len)) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_HEADER_INCOMPLETE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("Embedded IP header not complete\n"); + return 0; + } + + /* + * Is our embedded IP version wrong? 
+ */ + icmp_iph = (struct sfe_ipv4_ip_hdr *)(icmph + 1); + if (unlikely(icmp_iph->version != 4)) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_NON_V4]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("IP version: %u\n", icmp_iph->version); + return 0; + } + + /* + * Do we have the full embedded IP header, including any options? + */ + icmp_ihl_words = icmp_iph->ihl; + icmp_ihl = icmp_ihl_words << 2; + pull_len += icmp_ihl - sizeof(struct sfe_ipv4_ip_hdr); + if (!pskb_may_pull(skb, pull_len)) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_IP_OPTIONS_INCOMPLETE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("Embedded header not large enough for IP options\n"); + return 0; + } + + len -= icmp_ihl; + icmp_trans_h = ((u32 *)icmp_iph) + icmp_ihl_words; + + /* + * Handle the embedded transport layer header. + */ + switch (icmp_iph->protocol) { + case IPPROTO_UDP: + /* + * We should have 8 bytes of UDP header - that's enough to identify + * the connection. + */ + pull_len += 8; + if (!pskb_may_pull(skb, pull_len)) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_UDP_HEADER_INCOMPLETE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("Incomplete embedded UDP header\n"); + return 0; + } + + icmp_udph = (struct sfe_ipv4_udp_hdr *)icmp_trans_h; + src_port = icmp_udph->source; + dest_port = icmp_udph->dest; + break; + + case IPPROTO_TCP: + /* + * We should have 8 bytes of TCP header - that's enough to identify + * the connection. + */ + pull_len += 8; + if (!pskb_may_pull(skb, pull_len)) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_TCP_HEADER_INCOMPLETE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("Incomplete embedded TCP header\n"); + return 0; + } + + icmp_tcph = (struct sfe_ipv4_tcp_hdr *)icmp_trans_h; + src_port = icmp_tcph->source; + dest_port = icmp_tcph->dest; + break; + + default: + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_UNHANDLED_PROTOCOL]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("Unhandled embedded IP protocol: %u\n", icmp_iph->protocol); + return 0; + } + + src_ip = icmp_iph->saddr; + dest_ip = icmp_iph->daddr; + + spin_lock_bh(&si->lock); + + /* + * Look for a connection match. Note that we reverse the source and destination + * here because our embedded message contains a packet that was sent in the + * opposite direction to the one in which we just received it. It will have + * been sent on the interface from which we received it though so that's still + * ok to use. + */ + cm = sfe_ipv4_find_sfe_ipv4_connection_match(si, dev, icmp_iph->protocol, dest_ip, dest_port, src_ip, src_port); + if (unlikely(!cm)) { + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_ICMP_NO_CONNECTION]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("no connection found\n"); + return 0; + } + + /* + * We found a connection so now remove it from the connection list and flush + * its state. 
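+ * Flushing pushes the latest statistics and state for this connection to the
+ * sync callback before the memory is freed, so Linux has up-to-date state
+ * when it handles the ICMP error through the normal path.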
+ */ + c = cm->connection; + sfe_ipv4_remove_sfe_ipv4_connection(si, c); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_ICMP_FLUSHED_CONNECTION]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + sfe_ipv4_flush_sfe_ipv4_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; +} + +/* + * sfe_ipv4_recv() + * Handle packet receives and forwaring. + * + * Returns 1 if the packet is forwarded or 0 if it isn't. + */ +int sfe_ipv4_recv(struct net_device *dev, struct sk_buff *skb) +{ + struct sfe_ipv4 *si = &__si; + unsigned int len; + unsigned int tot_len; + unsigned int frag_off; + unsigned int ihl; + bool flush_on_find; + bool ip_options; + struct sfe_ipv4_ip_hdr *iph; + u32 protocol; + + /* + * Check that we have space for an IP header here. + */ + len = skb->len; + if (unlikely(!pskb_may_pull(skb, sizeof(struct sfe_ipv4_ip_hdr)))) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_HEADER_INCOMPLETE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("len: %u is too short\n", len); + return 0; + } + + /* + * Check that our "total length" is large enough for an IP header. + */ + iph = (struct sfe_ipv4_ip_hdr *)skb->data; + tot_len = ntohs(iph->tot_len); + if (unlikely(tot_len < sizeof(struct sfe_ipv4_ip_hdr))) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_BAD_TOTAL_LENGTH]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("tot_len: %u is too short\n", tot_len); + return 0; + } + + /* + * Is our IP version wrong? + */ + if (unlikely(iph->version != 4)) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_NON_V4]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("IP version: %u\n", iph->version); + return 0; + } + + /* + * Does our datagram fit inside the skb? + */ + if (unlikely(tot_len > len)) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_DATAGRAM_INCOMPLETE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("tot_len: %u, exceeds len: %u\n", tot_len, len); + return 0; + } + + /* + * Do we have a non-initial fragment? + */ + frag_off = ntohs(iph->frag_off); + if (unlikely(frag_off & IP_OFFSET)) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_NON_INITIAL_FRAGMENT]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("non-initial fragment\n"); + return 0; + } + + /* + * If we have a (first) fragment then mark it to cause any connection to flush. + */ + flush_on_find = unlikely(frag_off & IP_MF) ? true : false; + + /* + * Do we have any IP options? That's definite a slow path! If we do have IP + * options we need to recheck our header size. + */ + ihl = iph->ihl << 2; + ip_options = unlikely(ihl != sizeof(struct sfe_ipv4_ip_hdr)) ? 
true : false; + if (unlikely(ip_options)) { + if (unlikely(len < ihl)) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_IP_OPTIONS_INCOMPLETE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("len: %u is too short for header of size: %u\n", len, ihl); + return 0; + } + + flush_on_find = true; + } + + protocol = iph->protocol; + if (IPPROTO_UDP == protocol) { + return sfe_ipv4_recv_udp(si, skb, dev, len, iph, ihl, flush_on_find); + } + + if (IPPROTO_TCP == protocol) { + return sfe_ipv4_recv_tcp(si, skb, dev, len, iph, ihl, flush_on_find); + } + + if (IPPROTO_ICMP == protocol) { + return sfe_ipv4_recv_icmp(si, skb, dev, len, iph, ihl); + } + + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_UNHANDLED_PROTOCOL]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("not UDP, TCP or ICMP: %u\n", protocol); + return 0; +} + +static void +sfe_ipv4_update_tcp_state(struct sfe_ipv4_connection *c, + struct sfe_connection_create *sic) +{ + struct sfe_ipv4_connection_match *orig_cm; + struct sfe_ipv4_connection_match *repl_cm; + struct sfe_ipv4_tcp_connection_match *orig_tcp; + struct sfe_ipv4_tcp_connection_match *repl_tcp; + + orig_cm = c->original_match; + repl_cm = c->reply_match; + orig_tcp = &orig_cm->protocol_state.tcp; + repl_tcp = &repl_cm->protocol_state.tcp; + + /* update orig */ + if (orig_tcp->max_win < sic->src_td_max_window) { + orig_tcp->max_win = sic->src_td_max_window; + } + if ((s32)(orig_tcp->end - sic->src_td_end) < 0) { + orig_tcp->end = sic->src_td_end; + } + if ((s32)(orig_tcp->max_end - sic->src_td_max_end) < 0) { + orig_tcp->max_end = sic->src_td_max_end; + } + + /* update reply */ + if (repl_tcp->max_win < sic->dest_td_max_window) { + repl_tcp->max_win = sic->dest_td_max_window; + } + if ((s32)(repl_tcp->end - sic->dest_td_end) < 0) { + repl_tcp->end = sic->dest_td_end; + } + if ((s32)(repl_tcp->max_end - sic->dest_td_max_end) < 0) { + repl_tcp->max_end = sic->dest_td_max_end; + } + + /* update match flags */ + orig_cm->flags &= ~SFE_IPV4_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK; + repl_cm->flags &= ~SFE_IPV4_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK; + if (sic->flags & SFE_CREATE_FLAG_NO_SEQ_CHECK) { + orig_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK; + repl_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK; + } +} + +static void +sfe_ipv4_update_protocol_state(struct sfe_ipv4_connection *c, + struct sfe_connection_create *sic) +{ + switch (sic->protocol) { + case IPPROTO_TCP: + sfe_ipv4_update_tcp_state(c, sic); + break; + } +} + +void sfe_ipv4_update_rule(struct sfe_connection_create *sic) +{ + struct sfe_ipv4_connection *c; + struct sfe_ipv4 *si = &__si; + + spin_lock_bh(&si->lock); + + c = sfe_ipv4_find_sfe_ipv4_connection(si, + sic->protocol, + sic->src_ip.ip, + sic->src_port, + sic->dest_ip.ip, + sic->dest_port); + if (c != NULL) { + sfe_ipv4_update_protocol_state(c, sic); + } + + spin_unlock_bh(&si->lock); +} + +/* + * sfe_ipv4_create_rule() + * Create a forwarding rule. 
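+ * A connection is represented by one sfe_ipv4_connection object plus two
+ * sfe_ipv4_connection_match objects, one for the "original" direction and one
+ * for the "reply" direction; all three are allocated here and freed when the
+ * connection is flushed.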
+ */ +int sfe_ipv4_create_rule(struct sfe_connection_create *sic) +{ + struct sfe_ipv4 *si = &__si; + struct sfe_ipv4_connection *c; + struct sfe_ipv4_connection_match *original_cm; + struct sfe_ipv4_connection_match *reply_cm; + struct net_device *dest_dev; + struct net_device *src_dev; + + dest_dev = sic->dest_dev; + src_dev = sic->src_dev; + + if (unlikely((dest_dev->reg_state != NETREG_REGISTERED) || + (src_dev->reg_state != NETREG_REGISTERED))) { + return -EINVAL; + } + + spin_lock_bh(&si->lock); + si->connection_create_requests++; + + /* + * Check to see if there is already a flow that matches the rule we're + * trying to create. If there is then we can't create a new one. + */ + c = sfe_ipv4_find_sfe_ipv4_connection(si, + sic->protocol, + sic->src_ip.ip, + sic->src_port, + sic->dest_ip.ip, + sic->dest_port); + if (c != NULL) { + si->connection_create_collisions++; + + /* + * If we already have the flow then it's likely that this + * request to create the connection rule contains more + * up-to-date information. Check and update accordingly. + */ + sfe_ipv4_update_protocol_state(c, sic); + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("connection already exists - mark: %08x, p: %d\n" + " s: %s:%pxM:%pI4:%u, d: %s:%pxM:%pI4:%u\n", + sic->mark, sic->protocol, + sic->src_dev->name, sic->src_mac, &sic->src_ip.ip, ntohs(sic->src_port), + sic->dest_dev->name, sic->dest_mac, &sic->dest_ip.ip, ntohs(sic->dest_port)); + return -EADDRINUSE; + } + + /* + * Allocate the various connection tracking objects. + */ + c = (struct sfe_ipv4_connection *)kmalloc(sizeof(struct sfe_ipv4_connection), GFP_ATOMIC); + if (unlikely(!c)) { + spin_unlock_bh(&si->lock); + return -ENOMEM; + } + + original_cm = (struct sfe_ipv4_connection_match *)kmalloc(sizeof(struct sfe_ipv4_connection_match), GFP_ATOMIC); + if (unlikely(!original_cm)) { + spin_unlock_bh(&si->lock); + kfree(c); + return -ENOMEM; + } + + reply_cm = (struct sfe_ipv4_connection_match *)kmalloc(sizeof(struct sfe_ipv4_connection_match), GFP_ATOMIC); + if (unlikely(!reply_cm)) { + spin_unlock_bh(&si->lock); + kfree(original_cm); + kfree(c); + return -ENOMEM; + } + + /* + * Fill in the "original" direction connection matching object. + * Note that the transmit MAC address is "dest_mac_xlate" because + * we always know both ends of a connection by their translated + * addresses and not their public addresses. 
+ */ + original_cm->match_dev = src_dev; + original_cm->match_protocol = sic->protocol; + original_cm->match_src_ip = sic->src_ip.ip; + original_cm->match_src_port = sic->src_port; + original_cm->match_dest_ip = sic->dest_ip.ip; + original_cm->match_dest_port = sic->dest_port; + original_cm->xlate_src_ip = sic->src_ip_xlate.ip; + original_cm->xlate_src_port = sic->src_port_xlate; + original_cm->xlate_dest_ip = sic->dest_ip_xlate.ip; + original_cm->xlate_dest_port = sic->dest_port_xlate; + original_cm->rx_packet_count = 0; + original_cm->rx_packet_count64 = 0; + original_cm->rx_byte_count = 0; + original_cm->rx_byte_count64 = 0; + original_cm->xmit_dev = dest_dev; + original_cm->xmit_dev_mtu = sic->dest_mtu; + memcpy(original_cm->xmit_src_mac, dest_dev->dev_addr, ETH_ALEN); + memcpy(original_cm->xmit_dest_mac, sic->dest_mac_xlate, ETH_ALEN); + original_cm->connection = c; + original_cm->counter_match = reply_cm; + original_cm->flags = 0; + if (sic->flags & SFE_CREATE_FLAG_REMARK_PRIORITY) { + original_cm->priority = sic->src_priority; + original_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_PRIORITY_REMARK; + } + if (sic->flags & SFE_CREATE_FLAG_REMARK_DSCP) { + original_cm->dscp = sic->src_dscp << SFE_IPV4_DSCP_SHIFT; + original_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_DSCP_REMARK; + } +#ifdef CONFIG_NF_FLOW_COOKIE + original_cm->flow_cookie = 0; +#endif +#ifdef CONFIG_XFRM + original_cm->flow_accel = sic->original_accel; +#endif + original_cm->active_next = NULL; + original_cm->active_prev = NULL; + original_cm->active = false; + + /* + * For PPP links we don't write an L2 header. For everything else we do. + */ + if (!(dest_dev->flags & IFF_POINTOPOINT)) { + original_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_WRITE_L2_HDR; + + /* + * If our dev writes Ethernet headers then we can write a really fast + * version. + */ + if (dest_dev->header_ops) { + if (dest_dev->header_ops->create == eth_header) { + original_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_WRITE_FAST_ETH_HDR; + } + } + } + + /* + * Fill in the "reply" direction connection matching object. 
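+ * The reply direction matches on the translated addresses and ports (the
+ * packet as it comes back from the destination) and un-translates them to the
+ * original values.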
+ */ + reply_cm->match_dev = dest_dev; + reply_cm->match_protocol = sic->protocol; + reply_cm->match_src_ip = sic->dest_ip_xlate.ip; + reply_cm->match_src_port = sic->dest_port_xlate; + reply_cm->match_dest_ip = sic->src_ip_xlate.ip; + reply_cm->match_dest_port = sic->src_port_xlate; + reply_cm->xlate_src_ip = sic->dest_ip.ip; + reply_cm->xlate_src_port = sic->dest_port; + reply_cm->xlate_dest_ip = sic->src_ip.ip; + reply_cm->xlate_dest_port = sic->src_port; + reply_cm->rx_packet_count = 0; + reply_cm->rx_packet_count64 = 0; + reply_cm->rx_byte_count = 0; + reply_cm->rx_byte_count64 = 0; + reply_cm->xmit_dev = src_dev; + reply_cm->xmit_dev_mtu = sic->src_mtu; + memcpy(reply_cm->xmit_src_mac, src_dev->dev_addr, ETH_ALEN); + memcpy(reply_cm->xmit_dest_mac, sic->src_mac, ETH_ALEN); + reply_cm->connection = c; + reply_cm->counter_match = original_cm; + reply_cm->flags = 0; + if (sic->flags & SFE_CREATE_FLAG_REMARK_PRIORITY) { + reply_cm->priority = sic->dest_priority; + reply_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_PRIORITY_REMARK; + } + if (sic->flags & SFE_CREATE_FLAG_REMARK_DSCP) { + reply_cm->dscp = sic->dest_dscp << SFE_IPV4_DSCP_SHIFT; + reply_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_DSCP_REMARK; + } +#ifdef CONFIG_NF_FLOW_COOKIE + reply_cm->flow_cookie = 0; +#endif +#ifdef CONFIG_XFRM + reply_cm->flow_accel = sic->reply_accel; +#endif + reply_cm->active_next = NULL; + reply_cm->active_prev = NULL; + reply_cm->active = false; + + /* + * For PPP links we don't write an L2 header. For everything else we do. + */ + if (!(src_dev->flags & IFF_POINTOPOINT)) { + reply_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_WRITE_L2_HDR; + + /* + * If our dev writes Ethernet headers then we can write a really fast + * version. + */ + if (src_dev->header_ops) { + if (src_dev->header_ops->create == eth_header) { + reply_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_WRITE_FAST_ETH_HDR; + } + } + } + + + if (sic->dest_ip.ip != sic->dest_ip_xlate.ip || sic->dest_port != sic->dest_port_xlate) { + original_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_DEST; + reply_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_SRC; + } + + if (sic->src_ip.ip != sic->src_ip_xlate.ip || sic->src_port != sic->src_port_xlate) { + original_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_SRC; + reply_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_DEST; + } + + c->protocol = sic->protocol; + c->src_ip = sic->src_ip.ip; + c->src_ip_xlate = sic->src_ip_xlate.ip; + c->src_port = sic->src_port; + c->src_port_xlate = sic->src_port_xlate; + c->original_dev = src_dev; + c->original_match = original_cm; + c->dest_ip = sic->dest_ip.ip; + c->dest_ip_xlate = sic->dest_ip_xlate.ip; + c->dest_port = sic->dest_port; + c->dest_port_xlate = sic->dest_port_xlate; + c->reply_dev = dest_dev; + c->reply_match = reply_cm; + c->mark = sic->mark; + c->debug_read_seq = 0; + c->last_sync_jiffies = get_jiffies_64(); + + /* + * Take hold of our source and dest devices for the duration of the connection. + */ + dev_hold(c->original_dev); + dev_hold(c->reply_dev); + + /* + * Initialize the protocol-specific information that we track. + */ + switch (sic->protocol) { + case IPPROTO_TCP: + original_cm->protocol_state.tcp.win_scale = sic->src_td_window_scale; + original_cm->protocol_state.tcp.max_win = sic->src_td_max_window ? 
sic->src_td_max_window : 1; + original_cm->protocol_state.tcp.end = sic->src_td_end; + original_cm->protocol_state.tcp.max_end = sic->src_td_max_end; + reply_cm->protocol_state.tcp.win_scale = sic->dest_td_window_scale; + reply_cm->protocol_state.tcp.max_win = sic->dest_td_max_window ? sic->dest_td_max_window : 1; + reply_cm->protocol_state.tcp.end = sic->dest_td_end; + reply_cm->protocol_state.tcp.max_end = sic->dest_td_max_end; + if (sic->flags & SFE_CREATE_FLAG_NO_SEQ_CHECK) { + original_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK; + reply_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK; + } + break; + } + + sfe_ipv4_connection_match_compute_translations(original_cm); + sfe_ipv4_connection_match_compute_translations(reply_cm); + sfe_ipv4_insert_sfe_ipv4_connection(si, c); + + spin_unlock_bh(&si->lock); + + /* + * We have everything we need! + */ + DEBUG_INFO("new connection - mark: %08x, p: %d\n" + " s: %s:%pxM(%pxM):%pI4(%pI4):%u(%u)\n" + " d: %s:%pxM(%pxM):%pI4(%pI4):%u(%u)\n", + sic->mark, sic->protocol, + sic->src_dev->name, sic->src_mac, sic->src_mac_xlate, + &sic->src_ip.ip, &sic->src_ip_xlate.ip, ntohs(sic->src_port), ntohs(sic->src_port_xlate), + dest_dev->name, sic->dest_mac, sic->dest_mac_xlate, + &sic->dest_ip.ip, &sic->dest_ip_xlate.ip, ntohs(sic->dest_port), ntohs(sic->dest_port_xlate)); + + return 0; +} + +/* + * sfe_ipv4_destroy_rule() + * Destroy a forwarding rule. + */ +void sfe_ipv4_destroy_rule(struct sfe_connection_destroy *sid) +{ + struct sfe_ipv4 *si = &__si; + struct sfe_ipv4_connection *c; + + spin_lock_bh(&si->lock); + si->connection_destroy_requests++; + + /* + * Check to see if we have a flow that matches the rule we're trying + * to destroy. If there isn't then we can't destroy it. + */ + c = sfe_ipv4_find_sfe_ipv4_connection(si, sid->protocol, sid->src_ip.ip, sid->src_port, + sid->dest_ip.ip, sid->dest_port); + if (!c) { + si->connection_destroy_misses++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("connection does not exist - p: %d, s: %pI4:%u, d: %pI4:%u\n", + sid->protocol, &sid->src_ip, ntohs(sid->src_port), + &sid->dest_ip, ntohs(sid->dest_port)); + return; + } + + /* + * Remove our connection details from the hash tables. + */ + sfe_ipv4_remove_sfe_ipv4_connection(si, c); + spin_unlock_bh(&si->lock); + + sfe_ipv4_flush_sfe_ipv4_connection(si, c, SFE_SYNC_REASON_DESTROY); + + DEBUG_INFO("connection destroyed - p: %d, s: %pI4:%u, d: %pI4:%u\n", + sid->protocol, &sid->src_ip.ip, ntohs(sid->src_port), + &sid->dest_ip.ip, ntohs(sid->dest_port)); +} + +/* + * sfe_ipv4_register_sync_rule_callback() + * Register a callback for rule synchronization. + */ +void sfe_ipv4_register_sync_rule_callback(sfe_sync_rule_callback_t sync_rule_callback) +{ + struct sfe_ipv4 *si = &__si; + + spin_lock_bh(&si->lock); + rcu_assign_pointer(si->sync_rule_callback, sync_rule_callback); + spin_unlock_bh(&si->lock); +} + +/* + * sfe_ipv4_get_debug_dev() + */ +static ssize_t sfe_ipv4_get_debug_dev(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct sfe_ipv4 *si = &__si; + ssize_t count; + int num; + + spin_lock_bh(&si->lock); + num = si->debug_dev; + spin_unlock_bh(&si->lock); + + count = snprintf(buf, (ssize_t)PAGE_SIZE, "%d\n", num); + return count; +} + +/* + * sysfs attributes. + */ +static const struct device_attribute sfe_ipv4_debug_dev_attr = + __ATTR(debug_dev, S_IWUSR | S_IRUGO, sfe_ipv4_get_debug_dev, NULL); + +/* + * sfe_ipv4_destroy_all_rules_for_dev() + * Destroy all connections that match a particular device. 
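+ * Connections are torn down one at a time: each matching connection is
+ * removed from the hash tables under the lock and then flushed with the lock
+ * released, after which the scan restarts from the head of the list.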
+ * + * If we pass dev as NULL then this destroys all connections. + */ +void sfe_ipv4_destroy_all_rules_for_dev(struct net_device *dev) +{ + struct sfe_ipv4 *si = &__si; + struct sfe_ipv4_connection *c; + +another_round: + spin_lock_bh(&si->lock); + + for (c = si->all_connections_head; c; c = c->all_connections_next) { + /* + * Does this connection relate to the device we are destroying? + */ + if (!dev + || (dev == c->original_dev) + || (dev == c->reply_dev)) { + break; + } + } + + if (c) { + sfe_ipv4_remove_sfe_ipv4_connection(si, c); + } + + spin_unlock_bh(&si->lock); + + if (c) { + sfe_ipv4_flush_sfe_ipv4_connection(si, c, SFE_SYNC_REASON_DESTROY); + goto another_round; + } +} + +/* + * sfe_ipv4_periodic_sync() + */ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 15, 0)) +static void sfe_ipv4_periodic_sync(unsigned long arg) +#else +static void sfe_ipv4_periodic_sync(struct timer_list *tl) +#endif +{ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 15, 0)) + struct sfe_ipv4 *si = (struct sfe_ipv4 *)arg; +#else + struct sfe_ipv4 *si = from_timer(si, tl, timer); +#endif + u64 now_jiffies; + int quota; + sfe_sync_rule_callback_t sync_rule_callback; + + now_jiffies = get_jiffies_64(); + + rcu_read_lock(); + sync_rule_callback = rcu_dereference(si->sync_rule_callback); + if (!sync_rule_callback) { + rcu_read_unlock(); + goto done; + } + + spin_lock_bh(&si->lock); + sfe_ipv4_update_summary_stats(si); + + /* + * Get an estimate of the number of connections to parse in this sync. + */ + quota = (si->num_connections + 63) / 64; + + /* + * Walk the "active" list and sync the connection state. + */ + while (quota--) { + struct sfe_ipv4_connection_match *cm; + struct sfe_ipv4_connection_match *counter_cm; + struct sfe_ipv4_connection *c; + struct sfe_connection_sync sis; + + cm = si->active_head; + if (!cm) { + break; + } + + /* + * There's a possibility that our counter match is in the active list too. + * If it is then remove it. + */ + counter_cm = cm->counter_match; + if (counter_cm->active) { + counter_cm->active = false; + + /* + * We must have a connection preceding this counter match + * because that's the one that got us to this point, so we don't have + * to worry about removing the head of the list. + */ + counter_cm->active_prev->active_next = counter_cm->active_next; + + if (likely(counter_cm->active_next)) { + counter_cm->active_next->active_prev = counter_cm->active_prev; + } else { + si->active_tail = counter_cm->active_prev; + } + + counter_cm->active_next = NULL; + counter_cm->active_prev = NULL; + } + + /* + * Now remove the head of the active scan list. + */ + cm->active = false; + si->active_head = cm->active_next; + if (likely(cm->active_next)) { + cm->active_next->active_prev = NULL; + } else { + si->active_tail = NULL; + } + cm->active_next = NULL; + + /* + * Sync the connection state. + */ + c = cm->connection; + sfe_ipv4_gen_sync_sfe_ipv4_connection(si, c, &sis, SFE_SYNC_REASON_STATS, now_jiffies); + + /* + * We don't want to be holding the lock when we sync! + */ + spin_unlock_bh(&si->lock); + sync_rule_callback(&sis); + spin_lock_bh(&si->lock); + } + + spin_unlock_bh(&si->lock); + rcu_read_unlock(); + +done: + mod_timer(&si->timer, jiffies + ((HZ + 99) / 100)); +} + +#define CHAR_DEV_MSG_SIZE 768 + +/* + * sfe_ipv4_debug_dev_read_start() + * Generate part of the XML output. 
+ */ +static bool sfe_ipv4_debug_dev_read_start(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length, + int *total_read, struct sfe_ipv4_debug_xml_write_state *ws) +{ + int bytes_read; + + si->debug_read_seq++; + + bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\n"); + if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) { + return false; + } + + *length -= bytes_read; + *total_read += bytes_read; + + ws->state++; + return true; +} + +/* + * sfe_ipv4_debug_dev_read_connections_start() + * Generate part of the XML output. + */ +static bool sfe_ipv4_debug_dev_read_connections_start(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length, + int *total_read, struct sfe_ipv4_debug_xml_write_state *ws) +{ + int bytes_read; + + bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t\n"); + if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) { + return false; + } + + *length -= bytes_read; + *total_read += bytes_read; + + ws->state++; + return true; +} + +/* + * sfe_ipv4_debug_dev_read_connections_connection() + * Generate part of the XML output. + */ +static bool sfe_ipv4_debug_dev_read_connections_connection(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length, + int *total_read, struct sfe_ipv4_debug_xml_write_state *ws) +{ + struct sfe_ipv4_connection *c; + struct sfe_ipv4_connection_match *original_cm; + struct sfe_ipv4_connection_match *reply_cm; + int bytes_read; + int protocol; + struct net_device *src_dev; + __be32 src_ip; + __be32 src_ip_xlate; + __be16 src_port; + __be16 src_port_xlate; + u64 src_rx_packets; + u64 src_rx_bytes; + struct net_device *dest_dev; + __be32 dest_ip; + __be32 dest_ip_xlate; + __be16 dest_port; + __be16 dest_port_xlate; + u64 dest_rx_packets; + u64 dest_rx_bytes; + u64 last_sync_jiffies; + u32 mark, src_priority, dest_priority, src_dscp, dest_dscp; +#ifdef CONFIG_NF_FLOW_COOKIE + int src_flow_cookie, dst_flow_cookie; +#endif + + spin_lock_bh(&si->lock); + + for (c = si->all_connections_head; c; c = c->all_connections_next) { + if (c->debug_read_seq < si->debug_read_seq) { + c->debug_read_seq = si->debug_read_seq; + break; + } + } + + /* + * If there were no connections then move to the next state. 
+ */ + if (!c) { + spin_unlock_bh(&si->lock); + ws->state++; + return true; + } + + original_cm = c->original_match; + reply_cm = c->reply_match; + + protocol = c->protocol; + src_dev = c->original_dev; + src_ip = c->src_ip; + src_ip_xlate = c->src_ip_xlate; + src_port = c->src_port; + src_port_xlate = c->src_port_xlate; + src_priority = original_cm->priority; + src_dscp = original_cm->dscp >> SFE_IPV4_DSCP_SHIFT; + + sfe_ipv4_connection_match_update_summary_stats(original_cm); + sfe_ipv4_connection_match_update_summary_stats(reply_cm); + + src_rx_packets = original_cm->rx_packet_count64; + src_rx_bytes = original_cm->rx_byte_count64; + dest_dev = c->reply_dev; + dest_ip = c->dest_ip; + dest_ip_xlate = c->dest_ip_xlate; + dest_port = c->dest_port; + dest_port_xlate = c->dest_port_xlate; + dest_priority = reply_cm->priority; + dest_dscp = reply_cm->dscp >> SFE_IPV4_DSCP_SHIFT; + dest_rx_packets = reply_cm->rx_packet_count64; + dest_rx_bytes = reply_cm->rx_byte_count64; + last_sync_jiffies = get_jiffies_64() - c->last_sync_jiffies; + mark = c->mark; +#ifdef CONFIG_NF_FLOW_COOKIE + src_flow_cookie = original_cm->flow_cookie; + dst_flow_cookie = reply_cm->flow_cookie; +#endif + spin_unlock_bh(&si->lock); + + bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t\t\n", + protocol, + src_dev->name, + &src_ip, &src_ip_xlate, + ntohs(src_port), ntohs(src_port_xlate), + src_priority, src_dscp, + src_rx_packets, src_rx_bytes, + dest_dev->name, + &dest_ip, &dest_ip_xlate, + ntohs(dest_port), ntohs(dest_port_xlate), + dest_priority, dest_dscp, + dest_rx_packets, dest_rx_bytes, +#ifdef CONFIG_NF_FLOW_COOKIE + src_flow_cookie, dst_flow_cookie, +#endif + last_sync_jiffies, mark); + + if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) { + return false; + } + + *length -= bytes_read; + *total_read += bytes_read; + + return true; +} + +/* + * sfe_ipv4_debug_dev_read_connections_end() + * Generate part of the XML output. + */ +static bool sfe_ipv4_debug_dev_read_connections_end(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length, + int *total_read, struct sfe_ipv4_debug_xml_write_state *ws) +{ + int bytes_read; + + bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t\n"); + if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) { + return false; + } + + *length -= bytes_read; + *total_read += bytes_read; + + ws->state++; + return true; +} + +/* + * sfe_ipv4_debug_dev_read_exceptions_start() + * Generate part of the XML output. + */ +static bool sfe_ipv4_debug_dev_read_exceptions_start(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length, + int *total_read, struct sfe_ipv4_debug_xml_write_state *ws) +{ + int bytes_read; + + bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t\n"); + if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) { + return false; + } + + *length -= bytes_read; + *total_read += bytes_read; + + ws->state++; + return true; +} + +/* + * sfe_ipv4_debug_dev_read_exceptions_exception() + * Generate part of the XML output. 
+ */ +static bool sfe_ipv4_debug_dev_read_exceptions_exception(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length, + int *total_read, struct sfe_ipv4_debug_xml_write_state *ws) +{ + u64 ct; + + spin_lock_bh(&si->lock); + ct = si->exception_events64[ws->iter_exception]; + spin_unlock_bh(&si->lock); + + if (ct) { + int bytes_read; + + bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, + "\t\t\n", + sfe_ipv4_exception_events_string[ws->iter_exception], + ct); + if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) { + return false; + } + + *length -= bytes_read; + *total_read += bytes_read; + } + + ws->iter_exception++; + if (ws->iter_exception >= SFE_IPV4_EXCEPTION_EVENT_LAST) { + ws->iter_exception = 0; + ws->state++; + } + + return true; +} + +/* + * sfe_ipv4_debug_dev_read_exceptions_end() + * Generate part of the XML output. + */ +static bool sfe_ipv4_debug_dev_read_exceptions_end(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length, + int *total_read, struct sfe_ipv4_debug_xml_write_state *ws) +{ + int bytes_read; + + bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t\n"); + if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) { + return false; + } + + *length -= bytes_read; + *total_read += bytes_read; + + ws->state++; + return true; +} + +/* + * sfe_ipv4_debug_dev_read_stats() + * Generate part of the XML output. + */ +static bool sfe_ipv4_debug_dev_read_stats(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length, + int *total_read, struct sfe_ipv4_debug_xml_write_state *ws) +{ + int bytes_read; + unsigned int num_connections; + u64 packets_forwarded; + u64 packets_not_forwarded; + u64 connection_create_requests; + u64 connection_create_collisions; + u64 connection_destroy_requests; + u64 connection_destroy_misses; + u64 connection_flushes; + u64 connection_match_hash_hits; + u64 connection_match_hash_reorders; + + spin_lock_bh(&si->lock); + sfe_ipv4_update_summary_stats(si); + + num_connections = si->num_connections; + packets_forwarded = si->packets_forwarded64; + packets_not_forwarded = si->packets_not_forwarded64; + connection_create_requests = si->connection_create_requests64; + connection_create_collisions = si->connection_create_collisions64; + connection_destroy_requests = si->connection_destroy_requests64; + connection_destroy_misses = si->connection_destroy_misses64; + connection_flushes = si->connection_flushes64; + connection_match_hash_hits = si->connection_match_hash_hits64; + connection_match_hash_reorders = si->connection_match_hash_reorders64; + spin_unlock_bh(&si->lock); + + bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t\n", + num_connections, + packets_forwarded, + packets_not_forwarded, + connection_create_requests, + connection_create_collisions, + connection_destroy_requests, + connection_destroy_misses, + connection_flushes, + connection_match_hash_hits, + connection_match_hash_reorders); + if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) { + return false; + } + + *length -= bytes_read; + *total_read += bytes_read; + + ws->state++; + return true; +} + +/* + * sfe_ipv4_debug_dev_read_end() + * Generate part of the XML output. 
+ */ +static bool sfe_ipv4_debug_dev_read_end(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length, + int *total_read, struct sfe_ipv4_debug_xml_write_state *ws) +{ + int bytes_read; + + bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\n"); + if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) { + return false; + } + + *length -= bytes_read; + *total_read += bytes_read; + + ws->state++; + return true; +} + +/* + * Array of write functions that write various XML elements that correspond to + * our XML output state machine. + */ +static sfe_ipv4_debug_xml_write_method_t sfe_ipv4_debug_xml_write_methods[SFE_IPV4_DEBUG_XML_STATE_DONE] = { + sfe_ipv4_debug_dev_read_start, + sfe_ipv4_debug_dev_read_connections_start, + sfe_ipv4_debug_dev_read_connections_connection, + sfe_ipv4_debug_dev_read_connections_end, + sfe_ipv4_debug_dev_read_exceptions_start, + sfe_ipv4_debug_dev_read_exceptions_exception, + sfe_ipv4_debug_dev_read_exceptions_end, + sfe_ipv4_debug_dev_read_stats, + sfe_ipv4_debug_dev_read_end, +}; + +/* + * sfe_ipv4_debug_dev_read() + * Send info to userspace upon read request from user + */ +static ssize_t sfe_ipv4_debug_dev_read(struct file *filp, char *buffer, size_t length, loff_t *offset) +{ + char msg[CHAR_DEV_MSG_SIZE]; + int total_read = 0; + struct sfe_ipv4_debug_xml_write_state *ws; + struct sfe_ipv4 *si = &__si; + + ws = (struct sfe_ipv4_debug_xml_write_state *)filp->private_data; + while ((ws->state != SFE_IPV4_DEBUG_XML_STATE_DONE) && (length > CHAR_DEV_MSG_SIZE)) { + if ((sfe_ipv4_debug_xml_write_methods[ws->state])(si, buffer, msg, &length, &total_read, ws)) { + continue; + } + } + + return total_read; +} + +/* + * sfe_ipv4_debug_dev_write() + * Write to char device resets some stats + */ +static ssize_t sfe_ipv4_debug_dev_write(struct file *filp, const char *buffer, size_t length, loff_t *offset) +{ + struct sfe_ipv4 *si = &__si; + + spin_lock_bh(&si->lock); + sfe_ipv4_update_summary_stats(si); + + si->packets_forwarded64 = 0; + si->packets_not_forwarded64 = 0; + si->connection_create_requests64 = 0; + si->connection_create_collisions64 = 0; + si->connection_destroy_requests64 = 0; + si->connection_destroy_misses64 = 0; + si->connection_flushes64 = 0; + si->connection_match_hash_hits64 = 0; + si->connection_match_hash_reorders64 = 0; + spin_unlock_bh(&si->lock); + + return length; +} + +/* + * sfe_ipv4_debug_dev_open() + */ +static int sfe_ipv4_debug_dev_open(struct inode *inode, struct file *file) +{ + struct sfe_ipv4_debug_xml_write_state *ws; + + ws = (struct sfe_ipv4_debug_xml_write_state *)file->private_data; + if (!ws) { + ws = kzalloc(sizeof(struct sfe_ipv4_debug_xml_write_state), GFP_KERNEL); + if (!ws) { + return -ENOMEM; + } + + ws->state = SFE_IPV4_DEBUG_XML_STATE_START; + file->private_data = ws; + } + + return 0; +} + +/* + * sfe_ipv4_debug_dev_release() + */ +static int sfe_ipv4_debug_dev_release(struct inode *inode, struct file *file) +{ + struct sfe_ipv4_debug_xml_write_state *ws; + + ws = (struct sfe_ipv4_debug_xml_write_state *)file->private_data; + if (ws) { + /* + * We've finished with our output so free the write state. 
+ */ + kfree(ws); + } + + return 0; +} + +/* + * File operations used in the debug char device + */ +static struct file_operations sfe_ipv4_debug_dev_fops = { + .read = sfe_ipv4_debug_dev_read, + .write = sfe_ipv4_debug_dev_write, + .open = sfe_ipv4_debug_dev_open, + .release = sfe_ipv4_debug_dev_release +}; + +#ifdef CONFIG_NF_FLOW_COOKIE +/* + * sfe_register_flow_cookie_cb + * register a function in SFE to let SFE use this function to configure flow cookie for a flow + * + * Hardware driver which support flow cookie should register a callback function in SFE. Then SFE + * can use this function to configure flow cookie for a flow. + * return: 0, success; !=0, fail + */ +int sfe_register_flow_cookie_cb(flow_cookie_set_func_t cb) +{ + struct sfe_ipv4 *si = &__si; + + BUG_ON(!cb); + + if (si->flow_cookie_set_func) { + return -1; + } + + rcu_assign_pointer(si->flow_cookie_set_func, cb); + return 0; +} + +/* + * sfe_unregister_flow_cookie_cb + * unregister function which is used to configure flow cookie for a flow + * + * return: 0, success; !=0, fail + */ +int sfe_unregister_flow_cookie_cb(flow_cookie_set_func_t cb) +{ + struct sfe_ipv4 *si = &__si; + + RCU_INIT_POINTER(si->flow_cookie_set_func, NULL); + return 0; +} + +/* + * sfe_ipv4_get_flow_cookie() + */ +static ssize_t sfe_ipv4_get_flow_cookie(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct sfe_ipv4 *si = &__si; + return snprintf(buf, (ssize_t)PAGE_SIZE, "%d\n", si->flow_cookie_enable); +} + +/* + * sfe_ipv4_set_flow_cookie() + */ +static ssize_t sfe_ipv4_set_flow_cookie(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t size) +{ + struct sfe_ipv4 *si = &__si; + strict_strtol(buf, 0, (long int *)&si->flow_cookie_enable); + + return size; +} + +/* + * sysfs attributes. + */ +static const struct device_attribute sfe_ipv4_flow_cookie_attr = + __ATTR(flow_cookie_enable, S_IWUSR | S_IRUGO, sfe_ipv4_get_flow_cookie, sfe_ipv4_set_flow_cookie); +#endif /*CONFIG_NF_FLOW_COOKIE*/ + +/* + * sfe_ipv4_init() + */ +static int __init sfe_ipv4_init(void) +{ + struct sfe_ipv4 *si = &__si; + int result = -1; + + DEBUG_INFO("SFE IPv4 init\n"); + + /* + * Create sys/sfe_ipv4 + */ + si->sys_sfe_ipv4 = kobject_create_and_add("sfe_ipv4", NULL); + if (!si->sys_sfe_ipv4) { + DEBUG_ERROR("failed to register sfe_ipv4\n"); + goto exit1; + } + + /* + * Create files, one for each parameter supported by this module. + */ + result = sysfs_create_file(si->sys_sfe_ipv4, &sfe_ipv4_debug_dev_attr.attr); + if (result) { + DEBUG_ERROR("failed to register debug dev file: %d\n", result); + goto exit2; + } + +#ifdef CONFIG_NF_FLOW_COOKIE + result = sysfs_create_file(si->sys_sfe_ipv4, &sfe_ipv4_flow_cookie_attr.attr); + if (result) { + DEBUG_ERROR("failed to register flow cookie enable file: %d\n", result); + goto exit3; + } +#endif /* CONFIG_NF_FLOW_COOKIE */ + + /* + * Register our debug char device. + */ + result = register_chrdev(0, "sfe_ipv4", &sfe_ipv4_debug_dev_fops); + if (result < 0) { + DEBUG_ERROR("Failed to register chrdev: %d\n", result); + goto exit4; + } + + si->debug_dev = result; + + /* + * Create a timer to handle periodic statistics. 
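+ * The timer fires roughly every 10ms ((HZ + 99) / 100 jiffies) and each run
+ * syncs about 1/64th of the connections from the active list.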
+ */ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 15, 0)) + setup_timer(&si->timer, sfe_ipv4_periodic_sync, (unsigned long)si); +#else + timer_setup(&si->timer, sfe_ipv4_periodic_sync, 0); +#endif + mod_timer(&si->timer, jiffies + ((HZ + 99) / 100)); + + spin_lock_init(&si->lock); + + return 0; + +exit4: +#ifdef CONFIG_NF_FLOW_COOKIE + sysfs_remove_file(si->sys_sfe_ipv4, &sfe_ipv4_flow_cookie_attr.attr); + +exit3: +#endif /* CONFIG_NF_FLOW_COOKIE */ + sysfs_remove_file(si->sys_sfe_ipv4, &sfe_ipv4_debug_dev_attr.attr); + +exit2: + kobject_put(si->sys_sfe_ipv4); + +exit1: + return result; +} + +/* + * sfe_ipv4_exit() + */ +static void __exit sfe_ipv4_exit(void) +{ + struct sfe_ipv4 *si = &__si; + + DEBUG_INFO("SFE IPv4 exit\n"); + + /* + * Destroy all connections. + */ + sfe_ipv4_destroy_all_rules_for_dev(NULL); + + del_timer_sync(&si->timer); + + unregister_chrdev(si->debug_dev, "sfe_ipv4"); + +#ifdef CONFIG_NF_FLOW_COOKIE + sysfs_remove_file(si->sys_sfe_ipv4, &sfe_ipv4_flow_cookie_attr.attr); +#endif /* CONFIG_NF_FLOW_COOKIE */ + sysfs_remove_file(si->sys_sfe_ipv4, &sfe_ipv4_debug_dev_attr.attr); + + kobject_put(si->sys_sfe_ipv4); + +} + +module_init(sfe_ipv4_init) +module_exit(sfe_ipv4_exit) + +EXPORT_SYMBOL(sfe_ipv4_recv); +EXPORT_SYMBOL(sfe_ipv4_create_rule); +EXPORT_SYMBOL(sfe_ipv4_destroy_rule); +EXPORT_SYMBOL(sfe_ipv4_destroy_all_rules_for_dev); +EXPORT_SYMBOL(sfe_ipv4_register_sync_rule_callback); +EXPORT_SYMBOL(sfe_ipv4_mark_rule); +EXPORT_SYMBOL(sfe_ipv4_update_rule); +#ifdef CONFIG_NF_FLOW_COOKIE +EXPORT_SYMBOL(sfe_register_flow_cookie_cb); +EXPORT_SYMBOL(sfe_unregister_flow_cookie_cb); +#endif + +MODULE_DESCRIPTION("Shortcut Forwarding Engine - IPv4 edition"); +MODULE_LICENSE("Dual BSD/GPL"); + diff --git a/shortcut-fe/shortcut-fe/sfe_ipv6.c b/shortcut-fe/shortcut-fe/sfe_ipv6.c new file mode 100644 index 000000000..a7cb811a9 --- /dev/null +++ b/shortcut-fe/shortcut-fe/sfe_ipv6.c @@ -0,0 +1,3617 @@ +/* + * sfe_ipv6.c + * Shortcut forwarding engine - IPv6 support. + * + * Copyright (c) 2015-2016, 2019-2020 The Linux Foundation. All rights reserved. + * Permission to use, copy, modify, and/or distribute this software for + * any purpose with or without fee is hereby granted, provided that the + * above copyright notice and this permission notice appear in all copies. + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT + * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "sfe.h" +#include "sfe_cm.h" + +/* + * By default Linux IP header and transport layer header structures are + * unpacked, assuming that such headers should be 32-bit aligned. + * Unfortunately some wireless adaptors can't cope with this requirement and + * some CPUs can't handle misaligned accesses. For those platforms we + * define SFE_IPV6_UNALIGNED_IP_HEADER and mark the structures as packed. + * When we do this the compiler will generate slightly worse code than for the + * aligned case (on most platforms) but will be much quicker than fixing + * things up in an unaligned trap handler. 
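+ * For illustration: with an ordinary 14-byte Ethernet header the IPv6
+ * header starts on a 2-byte boundary, so an aligned 32-bit load of the
+ * address words below could fault on strict-alignment CPUs unless the
+ * structures are marked packed.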
+ */ +#define SFE_IPV6_UNALIGNED_IP_HEADER 1 +#if SFE_IPV6_UNALIGNED_IP_HEADER +#define SFE_IPV6_UNALIGNED_STRUCT __attribute__((packed)) +#else +#define SFE_IPV6_UNALIGNED_STRUCT +#endif + +#define CHAR_DEV_MSG_SIZE 768 + +/* + * An Ethernet header, but with an optional "packed" attribute to + * help with performance on some platforms (see the definition of + * SFE_IPV6_UNALIGNED_STRUCT) + */ +struct sfe_ipv6_eth_hdr { + __be16 h_dest[ETH_ALEN / 2]; + __be16 h_source[ETH_ALEN / 2]; + __be16 h_proto; +} SFE_IPV6_UNALIGNED_STRUCT; + +#define SFE_IPV6_DSCP_MASK 0xf03f +#define SFE_IPV6_DSCP_SHIFT 2 + +/* + * An IPv6 header, but with an optional "packed" attribute to + * help with performance on some platforms (see the definition of + * SFE_IPV6_UNALIGNED_STRUCT) + */ +struct sfe_ipv6_ip_hdr { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 priority:4, + version:4; +#elif defined(__BIG_ENDIAN_BITFIELD) + __u8 version:4, + priority:4; +#else +#error "Please fix " +#endif + __u8 flow_lbl[3]; + __be16 payload_len; + __u8 nexthdr; + __u8 hop_limit; + struct sfe_ipv6_addr saddr; + struct sfe_ipv6_addr daddr; + + /* + * The extension header start here. + */ +} SFE_IPV6_UNALIGNED_STRUCT; + +#define SFE_IPV6_EXT_HDR_HOP 0 +#define SFE_IPV6_EXT_HDR_ROUTING 43 +#define SFE_IPV6_EXT_HDR_FRAG 44 +#define SFE_IPV6_EXT_HDR_ESP 50 +#define SFE_IPV6_EXT_HDR_AH 51 +#define SFE_IPV6_EXT_HDR_NONE 59 +#define SFE_IPV6_EXT_HDR_DST 60 +#define SFE_IPV6_EXT_HDR_MH 135 + +/* + * fragmentation header + */ + +struct sfe_ipv6_frag_hdr { + __u8 nexthdr; + __u8 reserved; + __be16 frag_off; + __be32 identification; +}; + +#define SFE_IPV6_FRAG_OFFSET 0xfff8 + +/* + * generic IPv6 extension header + */ +struct sfe_ipv6_ext_hdr { + __u8 next_hdr; + __u8 hdr_len; + __u8 padding[6]; +} SFE_IPV6_UNALIGNED_STRUCT; + +/* + * A UDP header, but with an optional "packed" attribute to + * help with performance on some platforms (see the definition of + * SFE_IPV6_UNALIGNED_STRUCT) + */ +struct sfe_ipv6_udp_hdr { + __be16 source; + __be16 dest; + __be16 len; + __sum16 check; +} SFE_IPV6_UNALIGNED_STRUCT; + +/* + * A TCP header, but with an optional "packed" attribute to + * help with performance on some platforms (see the definition of + * SFE_IPV6_UNALIGNED_STRUCT) + */ +struct sfe_ipv6_tcp_hdr { + __be16 source; + __be16 dest; + __be32 seq; + __be32 ack_seq; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u16 res1:4, + doff:4, + fin:1, + syn:1, + rst:1, + psh:1, + ack:1, + urg:1, + ece:1, + cwr:1; +#elif defined(__BIG_ENDIAN_BITFIELD) + __u16 doff:4, + res1:4, + cwr:1, + ece:1, + urg:1, + ack:1, + psh:1, + rst:1, + syn:1, + fin:1; +#else +#error "Adjust your defines" +#endif + __be16 window; + __sum16 check; + __be16 urg_ptr; +} SFE_IPV6_UNALIGNED_STRUCT; + +/* + * Specifies the lower bound on ACK numbers carried in the TCP header + */ +#define SFE_IPV6_TCP_MAX_ACK_WINDOW 65520 + +/* + * IPv6 TCP connection match additional data. + */ +struct sfe_ipv6_tcp_connection_match { + u8 win_scale; /* Window scale */ + u32 max_win; /* Maximum window size seen */ + u32 end; /* Sequence number of the next byte to send (seq + segment length) */ + u32 max_end; /* Sequence number of the last byte to ack */ +}; + +/* + * Bit flags for IPv6 connection matching entry. 
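+ * The flags below are OR-ed together per direction and tested against
+ * cm->flags in the receive paths; for example, a flow needing source
+ * translation plus a fast Ethernet header write carries
+ * (XLATE_SRC | WRITE_FAST_ETH_HDR).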
+ */ +#define SFE_IPV6_CONNECTION_MATCH_FLAG_XLATE_SRC (1<<0) + /* Perform source translation */ +#define SFE_IPV6_CONNECTION_MATCH_FLAG_XLATE_DEST (1<<1) + /* Perform destination translation */ +#define SFE_IPV6_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK (1<<2) + /* Ignore TCP sequence numbers */ +#define SFE_IPV6_CONNECTION_MATCH_FLAG_WRITE_FAST_ETH_HDR (1<<3) + /* Fast Ethernet header write */ +#define SFE_IPV6_CONNECTION_MATCH_FLAG_WRITE_L2_HDR (1<<4) + /* Fast Ethernet header write */ +#define SFE_IPV6_CONNECTION_MATCH_FLAG_PRIORITY_REMARK (1<<5) + /* remark priority of SKB */ +#define SFE_IPV6_CONNECTION_MATCH_FLAG_DSCP_REMARK (1<<6) + /* remark DSCP of packet */ + +/* + * IPv6 connection matching structure. + */ +struct sfe_ipv6_connection_match { + /* + * References to other objects. + */ + struct sfe_ipv6_connection_match *next; + struct sfe_ipv6_connection_match *prev; + struct sfe_ipv6_connection *connection; + struct sfe_ipv6_connection_match *counter_match; + /* Matches the flow in the opposite direction as the one in connection */ + struct sfe_ipv6_connection_match *active_next; + struct sfe_ipv6_connection_match *active_prev; + bool active; /* Flag to indicate if we're on the active list */ + + /* + * Characteristics that identify flows that match this rule. + */ + struct net_device *match_dev; /* Network device */ + u8 match_protocol; /* Protocol */ + struct sfe_ipv6_addr match_src_ip[1]; /* Source IP address */ + struct sfe_ipv6_addr match_dest_ip[1]; /* Destination IP address */ + __be16 match_src_port; /* Source port/connection ident */ + __be16 match_dest_port; /* Destination port/connection ident */ + + /* + * Control the operations of the match. + */ + u32 flags; /* Bit flags */ +#ifdef CONFIG_NF_FLOW_COOKIE + u32 flow_cookie; /* used flow cookie, for debug */ +#endif +#ifdef CONFIG_XFRM + u32 flow_accel; /* The flow accelerated or not */ +#endif + + /* + * Connection state that we track once we match. + */ + union { /* Protocol-specific state */ + struct sfe_ipv6_tcp_connection_match tcp; + } protocol_state; + /* + * Stats recorded in a sync period. These stats will be added to + * rx_packet_count64/rx_byte_count64 after a sync period. + */ + u32 rx_packet_count; + u32 rx_byte_count; + + /* + * Packet translation information. + */ + struct sfe_ipv6_addr xlate_src_ip[1]; /* Address after source translation */ + __be16 xlate_src_port; /* Port/connection ident after source translation */ + u16 xlate_src_csum_adjustment; + /* Transport layer checksum adjustment after source translation */ + struct sfe_ipv6_addr xlate_dest_ip[1]; /* Address after destination translation */ + __be16 xlate_dest_port; /* Port/connection ident after destination translation */ + u16 xlate_dest_csum_adjustment; + /* Transport layer checksum adjustment after destination translation */ + + /* + * QoS information + */ + u32 priority; + u32 dscp; + + /* + * Packet transmit information. + */ + struct net_device *xmit_dev; /* Network device on which to transmit */ + unsigned short int xmit_dev_mtu; + /* Interface MTU */ + u16 xmit_dest_mac[ETH_ALEN / 2]; + /* Destination MAC address to use when forwarding */ + u16 xmit_src_mac[ETH_ALEN / 2]; + /* Source MAC address to use when forwarding */ + + /* + * Summary stats. + */ + u64 rx_packet_count64; + u64 rx_byte_count64; +}; + +/* + * Per-connection data structure. 
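+ * Each connection owns two sfe_ipv6_connection_match entries, one per
+ * direction; they point back here through ->connection and at each
+ * other through ->counter_match.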
+ */ +struct sfe_ipv6_connection { + struct sfe_ipv6_connection *next; + /* Pointer to the next entry in a hash chain */ + struct sfe_ipv6_connection *prev; + /* Pointer to the previous entry in a hash chain */ + int protocol; /* IP protocol number */ + struct sfe_ipv6_addr src_ip[1]; /* Src IP addr pre-translation */ + struct sfe_ipv6_addr src_ip_xlate[1]; /* Src IP addr post-translation */ + struct sfe_ipv6_addr dest_ip[1]; /* Dest IP addr pre-translation */ + struct sfe_ipv6_addr dest_ip_xlate[1]; /* Dest IP addr post-translation */ + __be16 src_port; /* Src port pre-translation */ + __be16 src_port_xlate; /* Src port post-translation */ + __be16 dest_port; /* Dest port pre-translation */ + __be16 dest_port_xlate; /* Dest port post-translation */ + struct sfe_ipv6_connection_match *original_match; + /* Original direction matching structure */ + struct net_device *original_dev; + /* Original direction source device */ + struct sfe_ipv6_connection_match *reply_match; + /* Reply direction matching structure */ + struct net_device *reply_dev; /* Reply direction source device */ + u64 last_sync_jiffies; /* Jiffies count for the last sync */ + struct sfe_ipv6_connection *all_connections_next; + /* Pointer to the next entry in the list of all connections */ + struct sfe_ipv6_connection *all_connections_prev; + /* Pointer to the previous entry in the list of all connections */ + u32 mark; /* mark for outgoing packet */ + u32 debug_read_seq; /* sequence number for debug dump */ +}; + +/* + * IPv6 connections and hash table size information. + */ +#define SFE_IPV6_CONNECTION_HASH_SHIFT 12 +#define SFE_IPV6_CONNECTION_HASH_SIZE (1 << SFE_IPV6_CONNECTION_HASH_SHIFT) +#define SFE_IPV6_CONNECTION_HASH_MASK (SFE_IPV6_CONNECTION_HASH_SIZE - 1) + +#ifdef CONFIG_NF_FLOW_COOKIE +#define SFE_FLOW_COOKIE_SIZE 2048 +#define SFE_FLOW_COOKIE_MASK 0x7ff + +struct sfe_ipv6_flow_cookie_entry { + struct sfe_ipv6_connection_match *match; + unsigned long last_clean_time; +}; +#endif + +enum sfe_ipv6_exception_events { + SFE_IPV6_EXCEPTION_EVENT_UDP_HEADER_INCOMPLETE, + SFE_IPV6_EXCEPTION_EVENT_UDP_NO_CONNECTION, + SFE_IPV6_EXCEPTION_EVENT_UDP_IP_OPTIONS_OR_INITIAL_FRAGMENT, + SFE_IPV6_EXCEPTION_EVENT_UDP_SMALL_TTL, + SFE_IPV6_EXCEPTION_EVENT_UDP_NEEDS_FRAGMENTATION, + SFE_IPV6_EXCEPTION_EVENT_TCP_HEADER_INCOMPLETE, + SFE_IPV6_EXCEPTION_EVENT_TCP_NO_CONNECTION_SLOW_FLAGS, + SFE_IPV6_EXCEPTION_EVENT_TCP_NO_CONNECTION_FAST_FLAGS, + SFE_IPV6_EXCEPTION_EVENT_TCP_IP_OPTIONS_OR_INITIAL_FRAGMENT, + SFE_IPV6_EXCEPTION_EVENT_TCP_SMALL_TTL, + SFE_IPV6_EXCEPTION_EVENT_TCP_NEEDS_FRAGMENTATION, + SFE_IPV6_EXCEPTION_EVENT_TCP_FLAGS, + SFE_IPV6_EXCEPTION_EVENT_TCP_SEQ_EXCEEDS_RIGHT_EDGE, + SFE_IPV6_EXCEPTION_EVENT_TCP_SMALL_DATA_OFFS, + SFE_IPV6_EXCEPTION_EVENT_TCP_BAD_SACK, + SFE_IPV6_EXCEPTION_EVENT_TCP_BIG_DATA_OFFS, + SFE_IPV6_EXCEPTION_EVENT_TCP_SEQ_BEFORE_LEFT_EDGE, + SFE_IPV6_EXCEPTION_EVENT_TCP_ACK_EXCEEDS_RIGHT_EDGE, + SFE_IPV6_EXCEPTION_EVENT_TCP_ACK_BEFORE_LEFT_EDGE, + SFE_IPV6_EXCEPTION_EVENT_ICMP_HEADER_INCOMPLETE, + SFE_IPV6_EXCEPTION_EVENT_ICMP_UNHANDLED_TYPE, + SFE_IPV6_EXCEPTION_EVENT_ICMP_IPV6_HEADER_INCOMPLETE, + SFE_IPV6_EXCEPTION_EVENT_ICMP_IPV6_NON_V6, + SFE_IPV6_EXCEPTION_EVENT_ICMP_IPV6_IP_OPTIONS_INCOMPLETE, + SFE_IPV6_EXCEPTION_EVENT_ICMP_IPV6_UDP_HEADER_INCOMPLETE, + SFE_IPV6_EXCEPTION_EVENT_ICMP_IPV6_TCP_HEADER_INCOMPLETE, + SFE_IPV6_EXCEPTION_EVENT_ICMP_IPV6_UNHANDLED_PROTOCOL, + SFE_IPV6_EXCEPTION_EVENT_ICMP_NO_CONNECTION, + SFE_IPV6_EXCEPTION_EVENT_ICMP_FLUSHED_CONNECTION, + 
SFE_IPV6_EXCEPTION_EVENT_HEADER_INCOMPLETE, + SFE_IPV6_EXCEPTION_EVENT_BAD_TOTAL_LENGTH, + SFE_IPV6_EXCEPTION_EVENT_NON_V6, + SFE_IPV6_EXCEPTION_EVENT_NON_INITIAL_FRAGMENT, + SFE_IPV6_EXCEPTION_EVENT_DATAGRAM_INCOMPLETE, + SFE_IPV6_EXCEPTION_EVENT_IP_OPTIONS_INCOMPLETE, + SFE_IPV6_EXCEPTION_EVENT_UNHANDLED_PROTOCOL, + SFE_IPV6_EXCEPTION_EVENT_FLOW_COOKIE_ADD_FAIL, + SFE_IPV6_EXCEPTION_EVENT_LAST +}; + +static char *sfe_ipv6_exception_events_string[SFE_IPV6_EXCEPTION_EVENT_LAST] = { + "UDP_HEADER_INCOMPLETE", + "UDP_NO_CONNECTION", + "UDP_IP_OPTIONS_OR_INITIAL_FRAGMENT", + "UDP_SMALL_TTL", + "UDP_NEEDS_FRAGMENTATION", + "TCP_HEADER_INCOMPLETE", + "TCP_NO_CONNECTION_SLOW_FLAGS", + "TCP_NO_CONNECTION_FAST_FLAGS", + "TCP_IP_OPTIONS_OR_INITIAL_FRAGMENT", + "TCP_SMALL_TTL", + "TCP_NEEDS_FRAGMENTATION", + "TCP_FLAGS", + "TCP_SEQ_EXCEEDS_RIGHT_EDGE", + "TCP_SMALL_DATA_OFFS", + "TCP_BAD_SACK", + "TCP_BIG_DATA_OFFS", + "TCP_SEQ_BEFORE_LEFT_EDGE", + "TCP_ACK_EXCEEDS_RIGHT_EDGE", + "TCP_ACK_BEFORE_LEFT_EDGE", + "ICMP_HEADER_INCOMPLETE", + "ICMP_UNHANDLED_TYPE", + "ICMP_IPV6_HEADER_INCOMPLETE", + "ICMP_IPV6_NON_V6", + "ICMP_IPV6_IP_OPTIONS_INCOMPLETE", + "ICMP_IPV6_UDP_HEADER_INCOMPLETE", + "ICMP_IPV6_TCP_HEADER_INCOMPLETE", + "ICMP_IPV6_UNHANDLED_PROTOCOL", + "ICMP_NO_CONNECTION", + "ICMP_FLUSHED_CONNECTION", + "HEADER_INCOMPLETE", + "BAD_TOTAL_LENGTH", + "NON_V6", + "NON_INITIAL_FRAGMENT", + "DATAGRAM_INCOMPLETE", + "IP_OPTIONS_INCOMPLETE", + "UNHANDLED_PROTOCOL", + "FLOW_COOKIE_ADD_FAIL" +}; + +/* + * Per-module structure. + */ +struct sfe_ipv6 { + spinlock_t lock; /* Lock for SMP correctness */ + struct sfe_ipv6_connection_match *active_head; + /* Head of the list of recently active connections */ + struct sfe_ipv6_connection_match *active_tail; + /* Tail of the list of recently active connections */ + struct sfe_ipv6_connection *all_connections_head; + /* Head of the list of all connections */ + struct sfe_ipv6_connection *all_connections_tail; + /* Tail of the list of all connections */ + unsigned int num_connections; /* Number of connections */ + struct timer_list timer; /* Timer used for periodic sync ops */ + sfe_sync_rule_callback_t __rcu sync_rule_callback; + /* Callback function registered by a connection manager for stats syncing */ + struct sfe_ipv6_connection *conn_hash[SFE_IPV6_CONNECTION_HASH_SIZE]; + /* Connection hash table */ + struct sfe_ipv6_connection_match *conn_match_hash[SFE_IPV6_CONNECTION_HASH_SIZE]; + /* Connection match hash table */ +#ifdef CONFIG_NF_FLOW_COOKIE + struct sfe_ipv6_flow_cookie_entry sfe_flow_cookie_table[SFE_FLOW_COOKIE_SIZE]; + /* flow cookie table*/ + sfe_ipv6_flow_cookie_set_func_t flow_cookie_set_func; + /* function used to configure flow cookie in hardware*/ + int flow_cookie_enable; + /* Enable/disable flow cookie at runtime */ +#endif + + /* + * Stats recorded in a sync period. These stats will be added to + * connection_xxx64 after a sync period. 
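+	 * They are bumped under si->lock on the forwarding path and folded
+	 * into the 64-bit totals (then reset) by sfe_ipv6_update_summary_stats().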
+ */ + u32 connection_create_requests; + /* Number of IPv6 connection create requests */ + u32 connection_create_collisions; + /* Number of IPv6 connection create requests that collided with existing hash table entries */ + u32 connection_destroy_requests; + /* Number of IPv6 connection destroy requests */ + u32 connection_destroy_misses; + /* Number of IPv6 connection destroy requests that missed our hash table */ + u32 connection_match_hash_hits; + /* Number of IPv6 connection match hash hits */ + u32 connection_match_hash_reorders; + /* Number of IPv6 connection match hash reorders */ + u32 connection_flushes; /* Number of IPv6 connection flushes */ + u32 packets_forwarded; /* Number of IPv6 packets forwarded */ + u32 packets_not_forwarded; /* Number of IPv6 packets not forwarded */ + u32 exception_events[SFE_IPV6_EXCEPTION_EVENT_LAST]; + + /* + * Summary statistics. + */ + u64 connection_create_requests64; + /* Number of IPv6 connection create requests */ + u64 connection_create_collisions64; + /* Number of IPv6 connection create requests that collided with existing hash table entries */ + u64 connection_destroy_requests64; + /* Number of IPv6 connection destroy requests */ + u64 connection_destroy_misses64; + /* Number of IPv6 connection destroy requests that missed our hash table */ + u64 connection_match_hash_hits64; + /* Number of IPv6 connection match hash hits */ + u64 connection_match_hash_reorders64; + /* Number of IPv6 connection match hash reorders */ + u64 connection_flushes64; /* Number of IPv6 connection flushes */ + u64 packets_forwarded64; /* Number of IPv6 packets forwarded */ + u64 packets_not_forwarded64; + /* Number of IPv6 packets not forwarded */ + u64 exception_events64[SFE_IPV6_EXCEPTION_EVENT_LAST]; + + /* + * Control state. + */ + struct kobject *sys_sfe_ipv6; /* sysfs linkage */ + int debug_dev; /* Major number of the debug char device */ + u32 debug_read_seq; /* sequence number for debug dump */ +}; + +/* + * Enumeration of the XML output. + */ +enum sfe_ipv6_debug_xml_states { + SFE_IPV6_DEBUG_XML_STATE_START, + SFE_IPV6_DEBUG_XML_STATE_CONNECTIONS_START, + SFE_IPV6_DEBUG_XML_STATE_CONNECTIONS_CONNECTION, + SFE_IPV6_DEBUG_XML_STATE_CONNECTIONS_END, + SFE_IPV6_DEBUG_XML_STATE_EXCEPTIONS_START, + SFE_IPV6_DEBUG_XML_STATE_EXCEPTIONS_EXCEPTION, + SFE_IPV6_DEBUG_XML_STATE_EXCEPTIONS_END, + SFE_IPV6_DEBUG_XML_STATE_STATS, + SFE_IPV6_DEBUG_XML_STATE_END, + SFE_IPV6_DEBUG_XML_STATE_DONE +}; + +/* + * XML write state. + */ +struct sfe_ipv6_debug_xml_write_state { + enum sfe_ipv6_debug_xml_states state; + /* XML output file state machine state */ + int iter_exception; /* Next exception iterator */ +}; + +typedef bool (*sfe_ipv6_debug_xml_write_method_t)(struct sfe_ipv6 *si, char *buffer, char *msg, size_t *length, + int *total_read, struct sfe_ipv6_debug_xml_write_state *ws); + +static struct sfe_ipv6 __si6; + +/* + * sfe_ipv6_get_debug_dev() + */ +static ssize_t sfe_ipv6_get_debug_dev(struct device *dev, struct device_attribute *attr, char *buf); + +/* + * sysfs attributes. 
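+ * The debug_dev attribute reports the major number of the debug char
+ * device; by analogy with the IPv4 module, a device node can then be
+ * created by hand (e.g. mknod with that major and minor 0) to read the
+ * XML statistics dump.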
+ */ +static const struct device_attribute sfe_ipv6_debug_dev_attr = + __ATTR(debug_dev, S_IWUSR | S_IRUGO, sfe_ipv6_get_debug_dev, NULL); + +/* + * sfe_ipv6_is_ext_hdr() + * check if we recognize ipv6 extension header + */ +static inline bool sfe_ipv6_is_ext_hdr(u8 hdr) +{ + return (hdr == SFE_IPV6_EXT_HDR_HOP) || + (hdr == SFE_IPV6_EXT_HDR_ROUTING) || + (hdr == SFE_IPV6_EXT_HDR_FRAG) || + (hdr == SFE_IPV6_EXT_HDR_AH) || + (hdr == SFE_IPV6_EXT_HDR_DST) || + (hdr == SFE_IPV6_EXT_HDR_MH); +} + +/* + * sfe_ipv6_change_dsfield() + * change dscp field in IPv6 packet + */ +static inline void sfe_ipv6_change_dsfield(struct sfe_ipv6_ip_hdr *iph, u8 dscp) +{ + __be16 *p = (__be16 *)iph; + + *p = ((*p & htons(SFE_IPV6_DSCP_MASK)) | htons((u16)dscp << 4)); +} + +/* + * sfe_ipv6_get_connection_match_hash() + * Generate the hash used in connection match lookups. + */ +static inline unsigned int sfe_ipv6_get_connection_match_hash(struct net_device *dev, u8 protocol, + struct sfe_ipv6_addr *src_ip, __be16 src_port, + struct sfe_ipv6_addr *dest_ip, __be16 dest_port) +{ + u32 idx, hash = 0; + size_t dev_addr = (size_t)dev; + + for (idx = 0; idx < 4; idx++) { + hash ^= src_ip->addr[idx] ^ dest_ip->addr[idx]; + } + hash = ((u32)dev_addr) ^ hash ^ protocol ^ ntohs(src_port ^ dest_port); + return ((hash >> SFE_IPV6_CONNECTION_HASH_SHIFT) ^ hash) & SFE_IPV6_CONNECTION_HASH_MASK; +} + +/* + * sfe_ipv6_find_connection_match() + * Get the IPv6 flow match info that corresponds to a particular 5-tuple. + * + * On entry we must be holding the lock that protects the hash table. + */ +static struct sfe_ipv6_connection_match * +sfe_ipv6_find_connection_match(struct sfe_ipv6 *si, struct net_device *dev, u8 protocol, + struct sfe_ipv6_addr *src_ip, __be16 src_port, + struct sfe_ipv6_addr *dest_ip, __be16 dest_port) +{ + struct sfe_ipv6_connection_match *cm; + struct sfe_ipv6_connection_match *head; + unsigned int conn_match_idx; + + conn_match_idx = sfe_ipv6_get_connection_match_hash(dev, protocol, src_ip, src_port, dest_ip, dest_port); + cm = si->conn_match_hash[conn_match_idx]; + + /* + * If we don't have anything in this chain then bail. + */ + if (unlikely(!cm)) { + return NULL; + } + + /* + * Hopefully the first entry is the one we want. + */ + if ((cm->match_src_port == src_port) + && (cm->match_dest_port == dest_port) + && (sfe_ipv6_addr_equal(cm->match_src_ip, src_ip)) + && (sfe_ipv6_addr_equal(cm->match_dest_ip, dest_ip)) + && (cm->match_protocol == protocol) + && (cm->match_dev == dev)) { + si->connection_match_hash_hits++; + return cm; + } + + /* + * Unfortunately we didn't find it at head, so we search it in chain and + * move matching entry to the top of the hash chain. We presume that this + * will be reused again very quickly. + */ + head = cm; + do { + cm = cm->next; + } while (cm && (cm->match_src_port != src_port + || cm->match_dest_port != dest_port + || !sfe_ipv6_addr_equal(cm->match_src_ip, src_ip) + || !sfe_ipv6_addr_equal(cm->match_dest_ip, dest_ip) + || cm->match_protocol != protocol + || cm->match_dev != dev)); + + /* + * Not found then we're done. + */ + if (unlikely(!cm)) { + return NULL; + } + + /* + * We found a match so move it. + */ + if (cm->next) { + cm->next->prev = cm->prev; + } + cm->prev->next = cm->next; + cm->prev = NULL; + cm->next = head; + head->prev = cm; + si->conn_match_hash[conn_match_idx] = cm; + si->connection_match_hash_reorders++; + + return cm; +} + +/* + * sfe_ipv6_connection_match_update_summary_stats() + * Update the summary stats for a connection match entry. 
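+ * Called from sfe_ipv6_gen_sync_connection() after the new counts have
+ * been copied into the sync message, so the per-period counters only
+ * ever cover one sync interval.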
+ */ +static inline void sfe_ipv6_connection_match_update_summary_stats(struct sfe_ipv6_connection_match *cm) +{ + cm->rx_packet_count64 += cm->rx_packet_count; + cm->rx_packet_count = 0; + cm->rx_byte_count64 += cm->rx_byte_count; + cm->rx_byte_count = 0; +} + +/* + * sfe_ipv6_connection_match_compute_translations() + * Compute port and address translations for a connection match entry. + */ +static void sfe_ipv6_connection_match_compute_translations(struct sfe_ipv6_connection_match *cm) +{ + u32 diff[9]; + u32 *idx_32; + u16 *idx_16; + + /* + * Before we insert the entry look to see if this is tagged as doing address + * translations. If it is then work out the adjustment that we need to apply + * to the transport checksum. + */ + if (cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_XLATE_SRC) { + u32 adj = 0; + u32 carry = 0; + + /* + * Precompute an incremental checksum adjustment so we can + * edit packets in this stream very quickly. The algorithm is from RFC1624. + */ + idx_32 = diff; + *(idx_32++) = cm->match_src_ip->addr[0]; + *(idx_32++) = cm->match_src_ip->addr[1]; + *(idx_32++) = cm->match_src_ip->addr[2]; + *(idx_32++) = cm->match_src_ip->addr[3]; + + idx_16 = (u16 *)idx_32; + *(idx_16++) = cm->match_src_port; + *(idx_16++) = ~cm->xlate_src_port; + idx_32 = (u32 *)idx_16; + + *(idx_32++) = ~cm->xlate_src_ip->addr[0]; + *(idx_32++) = ~cm->xlate_src_ip->addr[1]; + *(idx_32++) = ~cm->xlate_src_ip->addr[2]; + *(idx_32++) = ~cm->xlate_src_ip->addr[3]; + + /* + * When we compute this fold it down to a 16-bit offset + * as that way we can avoid having to do a double + * folding of the twos-complement result because the + * addition of 2 16-bit values cannot cause a double + * wrap-around! + */ + for (idx_32 = diff; idx_32 < diff + 9; idx_32++) { + u32 w = *idx_32; + adj += carry; + adj += w; + carry = (w > adj); + } + adj += carry; + adj = (adj & 0xffff) + (adj >> 16); + adj = (adj & 0xffff) + (adj >> 16); + cm->xlate_src_csum_adjustment = (u16)adj; + } + + if (cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_XLATE_DEST) { + u32 adj = 0; + u32 carry = 0; + + /* + * Precompute an incremental checksum adjustment so we can + * edit packets in this stream very quickly. The algorithm is from RFC1624. + */ + idx_32 = diff; + *(idx_32++) = cm->match_dest_ip->addr[0]; + *(idx_32++) = cm->match_dest_ip->addr[1]; + *(idx_32++) = cm->match_dest_ip->addr[2]; + *(idx_32++) = cm->match_dest_ip->addr[3]; + + idx_16 = (u16 *)idx_32; + *(idx_16++) = cm->match_dest_port; + *(idx_16++) = ~cm->xlate_dest_port; + idx_32 = (u32 *)idx_16; + + *(idx_32++) = ~cm->xlate_dest_ip->addr[0]; + *(idx_32++) = ~cm->xlate_dest_ip->addr[1]; + *(idx_32++) = ~cm->xlate_dest_ip->addr[2]; + *(idx_32++) = ~cm->xlate_dest_ip->addr[3]; + + /* + * When we compute this fold it down to a 16-bit offset + * as that way we can avoid having to do a double + * folding of the twos-complement result because the + * addition of 2 16-bit values cannot cause a double + * wrap-around! + */ + for (idx_32 = diff; idx_32 < diff + 9; idx_32++) { + u32 w = *idx_32; + adj += carry; + adj += w; + carry = (w > adj); + } + adj += carry; + adj = (adj & 0xffff) + (adj >> 16); + adj = (adj & 0xffff) + (adj >> 16); + cm->xlate_dest_csum_adjustment = (u16)adj; + } +} + +/* + * sfe_ipv6_update_summary_stats() + * Update the summary stats. 
+ */ +static void sfe_ipv6_update_summary_stats(struct sfe_ipv6 *si) +{ + int i; + + si->connection_create_requests64 += si->connection_create_requests; + si->connection_create_requests = 0; + si->connection_create_collisions64 += si->connection_create_collisions; + si->connection_create_collisions = 0; + si->connection_destroy_requests64 += si->connection_destroy_requests; + si->connection_destroy_requests = 0; + si->connection_destroy_misses64 += si->connection_destroy_misses; + si->connection_destroy_misses = 0; + si->connection_match_hash_hits64 += si->connection_match_hash_hits; + si->connection_match_hash_hits = 0; + si->connection_match_hash_reorders64 += si->connection_match_hash_reorders; + si->connection_match_hash_reorders = 0; + si->connection_flushes64 += si->connection_flushes; + si->connection_flushes = 0; + si->packets_forwarded64 += si->packets_forwarded; + si->packets_forwarded = 0; + si->packets_not_forwarded64 += si->packets_not_forwarded; + si->packets_not_forwarded = 0; + + for (i = 0; i < SFE_IPV6_EXCEPTION_EVENT_LAST; i++) { + si->exception_events64[i] += si->exception_events[i]; + si->exception_events[i] = 0; + } +} + +/* + * sfe_ipv6_insert_connection_match() + * Insert a connection match into the hash. + * + * On entry we must be holding the lock that protects the hash table. + */ +static inline void sfe_ipv6_insert_connection_match(struct sfe_ipv6 *si, + struct sfe_ipv6_connection_match *cm) +{ + struct sfe_ipv6_connection_match **hash_head; + struct sfe_ipv6_connection_match *prev_head; + unsigned int conn_match_idx + = sfe_ipv6_get_connection_match_hash(cm->match_dev, cm->match_protocol, + cm->match_src_ip, cm->match_src_port, + cm->match_dest_ip, cm->match_dest_port); + + hash_head = &si->conn_match_hash[conn_match_idx]; + prev_head = *hash_head; + cm->prev = NULL; + if (prev_head) { + prev_head->prev = cm; + } + + cm->next = prev_head; + *hash_head = cm; + +#ifdef CONFIG_NF_FLOW_COOKIE + if (!si->flow_cookie_enable || !(cm->flags & (SFE_IPV6_CONNECTION_MATCH_FLAG_XLATE_SRC | SFE_IPV6_CONNECTION_MATCH_FLAG_XLATE_DEST))) + return; + + /* + * Configure hardware to put a flow cookie in packet of this flow, + * then we can accelerate the lookup process when we received this packet. + */ + for (conn_match_idx = 1; conn_match_idx < SFE_FLOW_COOKIE_SIZE; conn_match_idx++) { + struct sfe_ipv6_flow_cookie_entry *entry = &si->sfe_flow_cookie_table[conn_match_idx]; + + if ((NULL == entry->match) && time_is_before_jiffies(entry->last_clean_time + HZ)) { + sfe_ipv6_flow_cookie_set_func_t func; + + rcu_read_lock(); + func = rcu_dereference(si->flow_cookie_set_func); + if (func) { + if (!func(cm->match_protocol, cm->match_src_ip->addr, cm->match_src_port, + cm->match_dest_ip->addr, cm->match_dest_port, conn_match_idx)) { + entry->match = cm; + cm->flow_cookie = conn_match_idx; + } else { + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_FLOW_COOKIE_ADD_FAIL]++; + } + } + rcu_read_unlock(); + + break; + } + } +#endif +} + +/* + * sfe_ipv6_remove_connection_match() + * Remove a connection match object from the hash. + * + * On entry we must be holding the lock that protects the hash table. 
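+ * Besides unlinking the entry from conn_match_hash[], this returns any
+ * hardware flow cookie slot it holds (when CONFIG_NF_FLOW_COOKIE is set)
+ * and removes it from the active list.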
+ */ +static inline void sfe_ipv6_remove_connection_match(struct sfe_ipv6 *si, struct sfe_ipv6_connection_match *cm) +{ +#ifdef CONFIG_NF_FLOW_COOKIE + if (si->flow_cookie_enable) { + /* + * Tell hardware that we no longer need a flow cookie in packet of this flow + */ + unsigned int conn_match_idx; + + for (conn_match_idx = 1; conn_match_idx < SFE_FLOW_COOKIE_SIZE; conn_match_idx++) { + struct sfe_ipv6_flow_cookie_entry *entry = &si->sfe_flow_cookie_table[conn_match_idx]; + + if (cm == entry->match) { + sfe_ipv6_flow_cookie_set_func_t func; + + rcu_read_lock(); + func = rcu_dereference(si->flow_cookie_set_func); + if (func) { + func(cm->match_protocol, cm->match_src_ip->addr, cm->match_src_port, + cm->match_dest_ip->addr, cm->match_dest_port, 0); + } + rcu_read_unlock(); + + cm->flow_cookie = 0; + entry->match = NULL; + entry->last_clean_time = jiffies; + break; + } + } + } +#endif + + /* + * Unlink the connection match entry from the hash. + */ + if (cm->prev) { + cm->prev->next = cm->next; + } else { + unsigned int conn_match_idx + = sfe_ipv6_get_connection_match_hash(cm->match_dev, cm->match_protocol, + cm->match_src_ip, cm->match_src_port, + cm->match_dest_ip, cm->match_dest_port); + si->conn_match_hash[conn_match_idx] = cm->next; + } + + if (cm->next) { + cm->next->prev = cm->prev; + } + + /* + * If the connection match entry is in the active list remove it. + */ + if (cm->active) { + if (likely(cm->active_prev)) { + cm->active_prev->active_next = cm->active_next; + } else { + si->active_head = cm->active_next; + } + + if (likely(cm->active_next)) { + cm->active_next->active_prev = cm->active_prev; + } else { + si->active_tail = cm->active_prev; + } + } +} + +/* + * sfe_ipv6_get_connection_hash() + * Generate the hash used in connection lookups. + */ +static inline unsigned int sfe_ipv6_get_connection_hash(u8 protocol, struct sfe_ipv6_addr *src_ip, __be16 src_port, + struct sfe_ipv6_addr *dest_ip, __be16 dest_port) +{ + u32 idx, hash = 0; + + for (idx = 0; idx < 4; idx++) { + hash ^= src_ip->addr[idx] ^ dest_ip->addr[idx]; + } + hash = hash ^ protocol ^ ntohs(src_port ^ dest_port); + return ((hash >> SFE_IPV6_CONNECTION_HASH_SHIFT) ^ hash) & SFE_IPV6_CONNECTION_HASH_MASK; +} + +/* + * sfe_ipv6_find_connection() + * Get the IPv6 connection info that corresponds to a particular 5-tuple. + * + * On entry we must be holding the lock that protects the hash table. + */ +static inline struct sfe_ipv6_connection *sfe_ipv6_find_connection(struct sfe_ipv6 *si, u32 protocol, + struct sfe_ipv6_addr *src_ip, __be16 src_port, + struct sfe_ipv6_addr *dest_ip, __be16 dest_port) +{ + struct sfe_ipv6_connection *c; + unsigned int conn_idx = sfe_ipv6_get_connection_hash(protocol, src_ip, src_port, dest_ip, dest_port); + c = si->conn_hash[conn_idx]; + + /* + * If we don't have anything in this chain then bale. + */ + if (unlikely(!c)) { + return NULL; + } + + /* + * Hopefully the first entry is the one we want. + */ + if ((c->src_port == src_port) + && (c->dest_port == dest_port) + && (sfe_ipv6_addr_equal(c->src_ip, src_ip)) + && (sfe_ipv6_addr_equal(c->dest_ip, dest_ip)) + && (c->protocol == protocol)) { + return c; + } + + /* + * Unfortunately we didn't find it at head, so we search it in chain. 
+ */ + do { + c = c->next; + } while (c && (c->src_port != src_port + || c->dest_port != dest_port + || !sfe_ipv6_addr_equal(c->src_ip, src_ip) + || !sfe_ipv6_addr_equal(c->dest_ip, dest_ip) + || c->protocol != protocol)); + + /* + * Will need connection entry for next create/destroy metadata, + * So no need to re-order entry for these requests + */ + return c; +} + +/* + * sfe_ipv6_mark_rule() + * Updates the mark for a current offloaded connection + * + * Will take hash lock upon entry + */ +void sfe_ipv6_mark_rule(struct sfe_connection_mark *mark) +{ + struct sfe_ipv6 *si = &__si6; + struct sfe_ipv6_connection *c; + + spin_lock_bh(&si->lock); + c = sfe_ipv6_find_connection(si, mark->protocol, + mark->src_ip.ip6, mark->src_port, + mark->dest_ip.ip6, mark->dest_port); + if (c) { + WARN_ON((0 != c->mark) && (0 == mark->mark)); + c->mark = mark->mark; + } + spin_unlock_bh(&si->lock); + + if (c) { + DEBUG_TRACE("Matching connection found for mark, " + "setting from %08x to %08x\n", + c->mark, mark->mark); + } +} + +/* + * sfe_ipv6_insert_connection() + * Insert a connection into the hash. + * + * On entry we must be holding the lock that protects the hash table. + */ +static void sfe_ipv6_insert_connection(struct sfe_ipv6 *si, struct sfe_ipv6_connection *c) +{ + struct sfe_ipv6_connection **hash_head; + struct sfe_ipv6_connection *prev_head; + unsigned int conn_idx; + + /* + * Insert entry into the connection hash. + */ + conn_idx = sfe_ipv6_get_connection_hash(c->protocol, c->src_ip, c->src_port, + c->dest_ip, c->dest_port); + hash_head = &si->conn_hash[conn_idx]; + prev_head = *hash_head; + c->prev = NULL; + if (prev_head) { + prev_head->prev = c; + } + + c->next = prev_head; + *hash_head = c; + + /* + * Insert entry into the "all connections" list. + */ + if (si->all_connections_tail) { + c->all_connections_prev = si->all_connections_tail; + si->all_connections_tail->all_connections_next = c; + } else { + c->all_connections_prev = NULL; + si->all_connections_head = c; + } + + si->all_connections_tail = c; + c->all_connections_next = NULL; + si->num_connections++; + + /* + * Insert the connection match objects too. + */ + sfe_ipv6_insert_connection_match(si, c->original_match); + sfe_ipv6_insert_connection_match(si, c->reply_match); +} + +/* + * sfe_ipv6_remove_connection() + * Remove a sfe_ipv6_connection object from the hash. + * + * On entry we must be holding the lock that protects the hash table. + */ +static void sfe_ipv6_remove_connection(struct sfe_ipv6 *si, struct sfe_ipv6_connection *c) +{ + /* + * Remove the connection match objects. + */ + sfe_ipv6_remove_connection_match(si, c->reply_match); + sfe_ipv6_remove_connection_match(si, c->original_match); + + /* + * Unlink the connection. + */ + if (c->prev) { + c->prev->next = c->next; + } else { + unsigned int conn_idx = sfe_ipv6_get_connection_hash(c->protocol, c->src_ip, c->src_port, + c->dest_ip, c->dest_port); + si->conn_hash[conn_idx] = c->next; + } + + if (c->next) { + c->next->prev = c->prev; + } + + /* + * Unlink connection from all_connections list + */ + if (c->all_connections_prev) { + c->all_connections_prev->all_connections_next = c->all_connections_next; + } else { + si->all_connections_head = c->all_connections_next; + } + + if (c->all_connections_next) { + c->all_connections_next->all_connections_prev = c->all_connections_prev; + } else { + si->all_connections_tail = c->all_connections_prev; + } + + si->num_connections--; +} + +/* + * sfe_ipv6_gen_sync_connection() + * Sync a connection. 
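+ * Fills in the sfe_connection_sync message with both directions' TCP
+ * window state and per-period traffic counters so the registered sync
+ * callback can consume them.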
+ * + * On entry to this function we expect that the lock for the connection is either + * already held or isn't required. + */ +static void sfe_ipv6_gen_sync_connection(struct sfe_ipv6 *si, struct sfe_ipv6_connection *c, + struct sfe_connection_sync *sis, sfe_sync_reason_t reason, + u64 now_jiffies) +{ + struct sfe_ipv6_connection_match *original_cm; + struct sfe_ipv6_connection_match *reply_cm; + + /* + * Fill in the update message. + */ + sis->is_v6 = 1; + sis->protocol = c->protocol; + sis->src_ip.ip6[0] = c->src_ip[0]; + sis->src_ip_xlate.ip6[0] = c->src_ip_xlate[0]; + sis->dest_ip.ip6[0] = c->dest_ip[0]; + sis->dest_ip_xlate.ip6[0] = c->dest_ip_xlate[0]; + sis->src_port = c->src_port; + sis->src_port_xlate = c->src_port_xlate; + sis->dest_port = c->dest_port; + sis->dest_port_xlate = c->dest_port_xlate; + + original_cm = c->original_match; + reply_cm = c->reply_match; + sis->src_td_max_window = original_cm->protocol_state.tcp.max_win; + sis->src_td_end = original_cm->protocol_state.tcp.end; + sis->src_td_max_end = original_cm->protocol_state.tcp.max_end; + sis->dest_td_max_window = reply_cm->protocol_state.tcp.max_win; + sis->dest_td_end = reply_cm->protocol_state.tcp.end; + sis->dest_td_max_end = reply_cm->protocol_state.tcp.max_end; + + sis->src_new_packet_count = original_cm->rx_packet_count; + sis->src_new_byte_count = original_cm->rx_byte_count; + sis->dest_new_packet_count = reply_cm->rx_packet_count; + sis->dest_new_byte_count = reply_cm->rx_byte_count; + + sfe_ipv6_connection_match_update_summary_stats(original_cm); + sfe_ipv6_connection_match_update_summary_stats(reply_cm); + + sis->src_dev = original_cm->match_dev; + sis->src_packet_count = original_cm->rx_packet_count64; + sis->src_byte_count = original_cm->rx_byte_count64; + + sis->dest_dev = reply_cm->match_dev; + sis->dest_packet_count = reply_cm->rx_packet_count64; + sis->dest_byte_count = reply_cm->rx_byte_count64; + + sis->reason = reason; + + /* + * Get the time increment since our last sync. + */ + sis->delta_jiffies = now_jiffies - c->last_sync_jiffies; + c->last_sync_jiffies = now_jiffies; +} + +/* + * sfe_ipv6_flush_connection() + * Flush a connection and free all associated resources. + * + * We need to be called with bottom halves disabled locally as we need to acquire + * the connection hash lock and release it again. In general we're actually called + * from within a BH and so we're fine, but we're also called when connections are + * torn down. + */ +static void sfe_ipv6_flush_connection(struct sfe_ipv6 *si, + struct sfe_ipv6_connection *c, + sfe_sync_reason_t reason) +{ + struct sfe_connection_sync sis; + u64 now_jiffies; + sfe_sync_rule_callback_t sync_rule_callback; + + rcu_read_lock(); + spin_lock_bh(&si->lock); + si->connection_flushes++; + sync_rule_callback = rcu_dereference(si->sync_rule_callback); + spin_unlock_bh(&si->lock); + + if (sync_rule_callback) { + /* + * Generate a sync message and then sync. + */ + now_jiffies = get_jiffies_64(); + sfe_ipv6_gen_sync_connection(si, c, &sis, reason, now_jiffies); + sync_rule_callback(&sis); + } + + rcu_read_unlock(); + + /* + * Release our hold of the source and dest devices and free the memory + * for our connection objects. + */ + dev_put(c->original_dev); + dev_put(c->reply_dev); + kfree(c->original_match); + kfree(c->reply_match); + kfree(c); +} + +/* + * sfe_ipv6_recv_udp() + * Handle UDP packet receives and forwarding. 
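+ * Returns 1 when the packet has been modified and queued on the transmit
+ * device (the caller must not touch it again), or 0 to hand it back to
+ * the normal Linux receive path.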
+ */ +static int sfe_ipv6_recv_udp(struct sfe_ipv6 *si, struct sk_buff *skb, struct net_device *dev, + unsigned int len, struct sfe_ipv6_ip_hdr *iph, unsigned int ihl, bool flush_on_find) +{ + struct sfe_ipv6_udp_hdr *udph; + struct sfe_ipv6_addr *src_ip; + struct sfe_ipv6_addr *dest_ip; + __be16 src_port; + __be16 dest_port; + struct sfe_ipv6_connection_match *cm; + struct net_device *xmit_dev; + + /* + * Is our packet too short to contain a valid UDP header? + */ + if (!pskb_may_pull(skb, (sizeof(struct sfe_ipv6_udp_hdr) + ihl))) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_UDP_HEADER_INCOMPLETE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("packet too short for UDP header\n"); + return 0; + } + + /* + * Read the IP address and port information. Read the IP header data first + * because we've almost certainly got that in the cache. We may not yet have + * the UDP header cached though so allow more time for any prefetching. + */ + src_ip = &iph->saddr; + dest_ip = &iph->daddr; + + udph = (struct sfe_ipv6_udp_hdr *)(skb->data + ihl); + src_port = udph->source; + dest_port = udph->dest; + + spin_lock_bh(&si->lock); + + /* + * Look for a connection match. + */ +#ifdef CONFIG_NF_FLOW_COOKIE + cm = si->sfe_flow_cookie_table[skb->flow_cookie & SFE_FLOW_COOKIE_MASK].match; + if (unlikely(!cm)) { + cm = sfe_ipv6_find_connection_match(si, dev, IPPROTO_UDP, src_ip, src_port, dest_ip, dest_port); + } +#else + cm = sfe_ipv6_find_connection_match(si, dev, IPPROTO_UDP, src_ip, src_port, dest_ip, dest_port); +#endif + if (unlikely(!cm)) { + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_UDP_NO_CONNECTION]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("no connection found\n"); + return 0; + } + + /* + * If our packet has beern marked as "flush on find" we can't actually + * forward it in the fast path, but now that we've found an associated + * connection we can flush that out before we process the packet. + */ + if (unlikely(flush_on_find)) { + struct sfe_ipv6_connection *c = cm->connection; + sfe_ipv6_remove_connection(si, c); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_UDP_IP_OPTIONS_OR_INITIAL_FRAGMENT]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("flush on find\n"); + sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + +#ifdef CONFIG_XFRM + /* + * We can't accelerate the flow on this direction, just let it go + * through the slow path. + */ + if (unlikely(!cm->flow_accel)) { + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + return 0; + } +#endif + + /* + * Does our hop_limit allow forwarding? + */ + if (unlikely(iph->hop_limit < 2)) { + struct sfe_ipv6_connection *c = cm->connection; + sfe_ipv6_remove_connection(si, c); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_UDP_SMALL_TTL]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("hop_limit too low\n"); + sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + /* + * If our packet is larger than the MTU of the transmit interface then + * we can't forward it easily. 
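+	 * Note that unlike the TCP path there is no skb_is_gso() exemption
+	 * here, so any UDP packet larger than the egress MTU flushes the rule
+	 * and falls back to the slow path.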
+ */ + if (unlikely(len > cm->xmit_dev_mtu)) { + struct sfe_ipv6_connection *c = cm->connection; + sfe_ipv6_remove_connection(si, c); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_UDP_NEEDS_FRAGMENTATION]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("larger than mtu\n"); + sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + /* + * From this point on we're good to modify the packet. + */ + + /* + * Check if skb was cloned. If it was, unshare it. Because + * the data area is going to be written in this path and we don't want to + * change the cloned skb's data section. + */ + if (unlikely(skb_cloned(skb))) { + DEBUG_TRACE("%px: skb is a cloned skb\n", skb); + skb = skb_unshare(skb, GFP_ATOMIC); + if (!skb) { + DEBUG_WARN("Failed to unshare the cloned skb\n"); + return 0; + } + + /* + * Update the iph and udph pointers with the unshared skb's data area. + */ + iph = (struct sfe_ipv6_ip_hdr *)skb->data; + udph = (struct sfe_ipv6_udp_hdr *)(skb->data + ihl); + } + + /* + * Update DSCP + */ + if (unlikely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_DSCP_REMARK)) { + sfe_ipv6_change_dsfield(iph, cm->dscp); + } + + /* + * Decrement our hop_limit. + */ + iph->hop_limit -= 1; + + /* + * Do we have to perform translations of the source address/port? + */ + if (unlikely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_XLATE_SRC)) { + u16 udp_csum; + + iph->saddr = cm->xlate_src_ip[0]; + udph->source = cm->xlate_src_port; + + /* + * Do we have a non-zero UDP checksum? If we do then we need + * to update it. + */ + udp_csum = udph->check; + if (likely(udp_csum)) { + u32 sum = udp_csum + cm->xlate_src_csum_adjustment; + sum = (sum & 0xffff) + (sum >> 16); + udph->check = (u16)sum; + } + } + + /* + * Do we have to perform translations of the destination address/port? + */ + if (unlikely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_XLATE_DEST)) { + u16 udp_csum; + + iph->daddr = cm->xlate_dest_ip[0]; + udph->dest = cm->xlate_dest_port; + + /* + * Do we have a non-zero UDP checksum? If we do then we need + * to update it. + */ + udp_csum = udph->check; + if (likely(udp_csum)) { + u32 sum = udp_csum + cm->xlate_dest_csum_adjustment; + sum = (sum & 0xffff) + (sum >> 16); + udph->check = (u16)sum; + } + } + + /* + * Update traffic stats. + */ + cm->rx_packet_count++; + cm->rx_byte_count += len; + + /* + * If we're not already on the active list then insert ourselves at the tail + * of the current list. + */ + if (unlikely(!cm->active)) { + cm->active = true; + cm->active_prev = si->active_tail; + if (likely(si->active_tail)) { + si->active_tail->active_next = cm; + } else { + si->active_head = cm; + } + si->active_tail = cm; + } + + xmit_dev = cm->xmit_dev; + skb->dev = xmit_dev; + + /* + * Check to see if we need to write a header. + */ + if (likely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_WRITE_L2_HDR)) { + if (unlikely(!(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_WRITE_FAST_ETH_HDR))) { + dev_hard_header(skb, xmit_dev, ETH_P_IPV6, + cm->xmit_dest_mac, cm->xmit_src_mac, len); + } else { + /* + * For the simple case we write this really fast. + */ + struct sfe_ipv6_eth_hdr *eth = (struct sfe_ipv6_eth_hdr *)__skb_push(skb, ETH_HLEN); + eth->h_proto = htons(ETH_P_IPV6); + eth->h_dest[0] = cm->xmit_dest_mac[0]; + eth->h_dest[1] = cm->xmit_dest_mac[1]; + eth->h_dest[2] = cm->xmit_dest_mac[2]; + eth->h_source[0] = cm->xmit_src_mac[0]; + eth->h_source[1] = cm->xmit_src_mac[1]; + eth->h_source[2] = cm->xmit_src_mac[2]; + } + } + + /* + * Update priority of skb. 
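+	 * (Only done when the rule was created with the PRIORITY_REMARK flag;
+	 * otherwise skb->priority is left as the stack set it.)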
+ */ + if (unlikely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_PRIORITY_REMARK)) { + skb->priority = cm->priority; + } + + /* + * Mark outgoing packet. + */ + skb->mark = cm->connection->mark; + if (skb->mark) { + DEBUG_TRACE("SKB MARK is NON ZERO %x\n", skb->mark); + } + + si->packets_forwarded++; + spin_unlock_bh(&si->lock); + + /* + * We're going to check for GSO flags when we transmit the packet so + * start fetching the necessary cache line now. + */ + prefetch(skb_shinfo(skb)); + + /* + * Mark that this packet has been fast forwarded. + */ + skb->fast_forwarded = 1; + + /* + * Send the packet on its way. + */ + dev_queue_xmit(skb); + + return 1; +} + +/* + * sfe_ipv6_process_tcp_option_sack() + * Parse TCP SACK option and update ack according + */ +static bool sfe_ipv6_process_tcp_option_sack(const struct sfe_ipv6_tcp_hdr *th, const u32 data_offs, + u32 *ack) +{ + u32 length = sizeof(struct sfe_ipv6_tcp_hdr); + u8 *ptr = (u8 *)th + length; + + /* + * Ignore processing if TCP packet has only TIMESTAMP option. + */ + if (likely(data_offs == length + TCPOLEN_TIMESTAMP + 1 + 1) + && likely(ptr[0] == TCPOPT_NOP) + && likely(ptr[1] == TCPOPT_NOP) + && likely(ptr[2] == TCPOPT_TIMESTAMP) + && likely(ptr[3] == TCPOLEN_TIMESTAMP)) { + return true; + } + + /* + * TCP options. Parse SACK option. + */ + while (length < data_offs) { + u8 size; + u8 kind; + + ptr = (u8 *)th + length; + kind = *ptr; + + /* + * NOP, for padding + * Not in the switch because to fast escape and to not calculate size + */ + if (kind == TCPOPT_NOP) { + length++; + continue; + } + + if (kind == TCPOPT_SACK) { + u32 sack = 0; + u8 re = 1 + 1; + + size = *(ptr + 1); + if ((size < (1 + 1 + TCPOLEN_SACK_PERBLOCK)) + || ((size - (1 + 1)) % (TCPOLEN_SACK_PERBLOCK)) + || (size > (data_offs - length))) { + return false; + } + + re += 4; + while (re < size) { + u32 sack_re; + u8 *sptr = ptr + re; + sack_re = (sptr[0] << 24) | (sptr[1] << 16) | (sptr[2] << 8) | sptr[3]; + if (sack_re > sack) { + sack = sack_re; + } + re += TCPOLEN_SACK_PERBLOCK; + } + if (sack > *ack) { + *ack = sack; + } + length += size; + continue; + } + if (kind == TCPOPT_EOL) { + return true; + } + size = *(ptr + 1); + if (size < 2) { + return false; + } + length += size; + } + + return true; +} + +/* + * sfe_ipv6_recv_tcp() + * Handle TCP packet receives and forwarding. + */ +static int sfe_ipv6_recv_tcp(struct sfe_ipv6 *si, struct sk_buff *skb, struct net_device *dev, + unsigned int len, struct sfe_ipv6_ip_hdr *iph, unsigned int ihl, bool flush_on_find) +{ + struct sfe_ipv6_tcp_hdr *tcph; + struct sfe_ipv6_addr *src_ip; + struct sfe_ipv6_addr *dest_ip; + __be16 src_port; + __be16 dest_port; + struct sfe_ipv6_connection_match *cm; + struct sfe_ipv6_connection_match *counter_cm; + u32 flags; + struct net_device *xmit_dev; + + /* + * Is our packet too short to contain a valid UDP header? + */ + if (!pskb_may_pull(skb, (sizeof(struct sfe_ipv6_tcp_hdr) + ihl))) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_TCP_HEADER_INCOMPLETE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("packet too short for TCP header\n"); + return 0; + } + + /* + * Read the IP address and port information. Read the IP header data first + * because we've almost certainly got that in the cache. We may not yet have + * the TCP header cached though so allow more time for any prefetching. 
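+	 * tcp_flag_word() below fetches the 32-bit word holding the TCP flag
+	 * bits so the fast-path checks can mask SYN/RST/FIN/ACK in one go.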
+ */ + src_ip = &iph->saddr; + dest_ip = &iph->daddr; + + tcph = (struct sfe_ipv6_tcp_hdr *)(skb->data + ihl); + src_port = tcph->source; + dest_port = tcph->dest; + flags = tcp_flag_word(tcph); + + spin_lock_bh(&si->lock); + + /* + * Look for a connection match. + */ +#ifdef CONFIG_NF_FLOW_COOKIE + cm = si->sfe_flow_cookie_table[skb->flow_cookie & SFE_FLOW_COOKIE_MASK].match; + if (unlikely(!cm)) { + cm = sfe_ipv6_find_connection_match(si, dev, IPPROTO_TCP, src_ip, src_port, dest_ip, dest_port); + } +#else + cm = sfe_ipv6_find_connection_match(si, dev, IPPROTO_TCP, src_ip, src_port, dest_ip, dest_port); +#endif + if (unlikely(!cm)) { + /* + * We didn't get a connection but as TCP is connection-oriented that + * may be because this is a non-fast connection (not running established). + * For diagnostic purposes we differentiate this here. + */ + if (likely((flags & (TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_FIN | TCP_FLAG_ACK)) == TCP_FLAG_ACK)) { + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_TCP_NO_CONNECTION_FAST_FLAGS]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("no connection found - fast flags\n"); + return 0; + } + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_TCP_NO_CONNECTION_SLOW_FLAGS]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("no connection found - slow flags: 0x%x\n", + flags & (TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_FIN | TCP_FLAG_ACK)); + return 0; + } + + /* + * If our packet has beern marked as "flush on find" we can't actually + * forward it in the fast path, but now that we've found an associated + * connection we can flush that out before we process the packet. + */ + if (unlikely(flush_on_find)) { + struct sfe_ipv6_connection *c = cm->connection; + sfe_ipv6_remove_connection(si, c); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_TCP_IP_OPTIONS_OR_INITIAL_FRAGMENT]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("flush on find\n"); + sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + +#ifdef CONFIG_XFRM + /* + * We can't accelerate the flow on this direction, just let it go + * through the slow path. + */ + if (unlikely(!cm->flow_accel)) { + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + return 0; + } +#endif + + /* + * Does our hop_limit allow forwarding? + */ + if (unlikely(iph->hop_limit < 2)) { + struct sfe_ipv6_connection *c = cm->connection; + sfe_ipv6_remove_connection(si, c); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_TCP_SMALL_TTL]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("hop_limit too low\n"); + sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + /* + * If our packet is larger than the MTU of the transmit interface then + * we can't forward it easily. + */ + if (unlikely((len > cm->xmit_dev_mtu) && !skb_is_gso(skb))) { + struct sfe_ipv6_connection *c = cm->connection; + sfe_ipv6_remove_connection(si, c); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_TCP_NEEDS_FRAGMENTATION]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("larger than mtu\n"); + sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + /* + * Look at our TCP flags. Anything missing an ACK or that has RST, SYN or FIN + * set is not a fast path packet. 
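+	 * For example, a pure ACK segment passes, while a FIN/ACK or RST fails
+	 * the test below, flushes the connection and is handed back to the
+	 * slow path.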
+ */ + if (unlikely((flags & (TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_FIN | TCP_FLAG_ACK)) != TCP_FLAG_ACK)) { + struct sfe_ipv6_connection *c = cm->connection; + sfe_ipv6_remove_connection(si, c); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_TCP_FLAGS]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("TCP flags: 0x%x are not fast\n", + flags & (TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_FIN | TCP_FLAG_ACK)); + sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + counter_cm = cm->counter_match; + + /* + * Are we doing sequence number checking? + */ + if (likely(!(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK))) { + u32 seq; + u32 ack; + u32 sack; + u32 data_offs; + u32 end; + u32 left_edge; + u32 scaled_win; + u32 max_end; + + /* + * Is our sequence fully past the right hand edge of the window? + */ + seq = ntohl(tcph->seq); + if (unlikely((s32)(seq - (cm->protocol_state.tcp.max_end + 1)) > 0)) { + struct sfe_ipv6_connection *c = cm->connection; + sfe_ipv6_remove_connection(si, c); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_TCP_SEQ_EXCEEDS_RIGHT_EDGE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("seq: %u exceeds right edge: %u\n", + seq, cm->protocol_state.tcp.max_end + 1); + sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + /* + * Check that our TCP data offset isn't too short. + */ + data_offs = tcph->doff << 2; + if (unlikely(data_offs < sizeof(struct sfe_ipv6_tcp_hdr))) { + struct sfe_ipv6_connection *c = cm->connection; + sfe_ipv6_remove_connection(si, c); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_TCP_SMALL_DATA_OFFS]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("TCP data offset: %u, too small\n", data_offs); + sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + /* + * Update ACK according to any SACK option. + */ + ack = ntohl(tcph->ack_seq); + sack = ack; + if (unlikely(!sfe_ipv6_process_tcp_option_sack(tcph, data_offs, &sack))) { + struct sfe_ipv6_connection *c = cm->connection; + sfe_ipv6_remove_connection(si, c); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_TCP_BAD_SACK]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("TCP option SACK size is wrong\n"); + sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + /* + * Check that our TCP data offset isn't past the end of the packet. + */ + data_offs += sizeof(struct sfe_ipv6_ip_hdr); + if (unlikely(len < data_offs)) { + struct sfe_ipv6_connection *c = cm->connection; + sfe_ipv6_remove_connection(si, c); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_TCP_BIG_DATA_OFFS]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("TCP data offset: %u, past end of packet: %u\n", + data_offs, len); + sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + end = seq + len - data_offs; + + /* + * Is our sequence fully before the left hand edge of the window? 
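+		 * Illustrative numbers (not from the code): with end = 5000 in this
+		 * direction and max_win = 1000 in the counter direction, the left
+		 * edge is 5000 - 1000 - 1 = 3999, so a segment whose computed end
+		 * falls below that is treated as stale.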
+ */ + if (unlikely((s32)(end - (cm->protocol_state.tcp.end + - counter_cm->protocol_state.tcp.max_win - 1)) < 0)) { + struct sfe_ipv6_connection *c = cm->connection; + sfe_ipv6_remove_connection(si, c); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_TCP_SEQ_BEFORE_LEFT_EDGE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("seq: %u before left edge: %u\n", + end, cm->protocol_state.tcp.end - counter_cm->protocol_state.tcp.max_win - 1); + sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + /* + * Are we acking data that is to the right of what has been sent? + */ + if (unlikely((s32)(sack - (counter_cm->protocol_state.tcp.end + 1)) > 0)) { + struct sfe_ipv6_connection *c = cm->connection; + sfe_ipv6_remove_connection(si, c); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_TCP_ACK_EXCEEDS_RIGHT_EDGE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("ack: %u exceeds right edge: %u\n", + sack, counter_cm->protocol_state.tcp.end + 1); + sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + /* + * Is our ack too far before the left hand edge of the window? + */ + left_edge = counter_cm->protocol_state.tcp.end + - cm->protocol_state.tcp.max_win + - SFE_IPV6_TCP_MAX_ACK_WINDOW + - 1; + if (unlikely((s32)(sack - left_edge) < 0)) { + struct sfe_ipv6_connection *c = cm->connection; + sfe_ipv6_remove_connection(si, c); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_TCP_ACK_BEFORE_LEFT_EDGE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("ack: %u before left edge: %u\n", sack, left_edge); + sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + /* + * Have we just seen the largest window size yet for this connection? If yes + * then we need to record the new value. + */ + scaled_win = ntohs(tcph->window) << cm->protocol_state.tcp.win_scale; + scaled_win += (sack - ack); + if (unlikely(cm->protocol_state.tcp.max_win < scaled_win)) { + cm->protocol_state.tcp.max_win = scaled_win; + } + + /* + * If our sequence and/or ack numbers have advanced then record the new state. + */ + if (likely((s32)(end - cm->protocol_state.tcp.end) >= 0)) { + cm->protocol_state.tcp.end = end; + } + + max_end = sack + scaled_win; + if (likely((s32)(max_end - counter_cm->protocol_state.tcp.max_end) >= 0)) { + counter_cm->protocol_state.tcp.max_end = max_end; + } + } + + /* + * From this point on we're good to modify the packet. + */ + + /* + * Check if skb was cloned. If it was, unshare it. Because + * the data area is going to be written in this path and we don't want to + * change the cloned skb's data section. + */ + if (unlikely(skb_cloned(skb))) { + DEBUG_TRACE("%px: skb is a cloned skb\n", skb); + skb = skb_unshare(skb, GFP_ATOMIC); + if (!skb) { + DEBUG_WARN("Failed to unshare the cloned skb\n"); + return 0; + } + + /* + * Update the iph and tcph pointers with the unshared skb's data area. + */ + iph = (struct sfe_ipv6_ip_hdr *)skb->data; + tcph = (struct sfe_ipv6_tcp_hdr *)(skb->data + ihl); + } + + /* + * Update DSCP + */ + if (unlikely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_DSCP_REMARK)) { + sfe_ipv6_change_dsfield(iph, cm->dscp); + } + + /* + * Decrement our hop_limit. + */ + iph->hop_limit -= 1; + + /* + * Do we have to perform translations of the source address/port? 
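+	 * The adjustment added below is the incremental (RFC 1624 style) value
+	 * precomputed at rule-creation time; folding the 32-bit sum back to
+	 * 16 bits avoids re-checksumming the payload.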
+	 */
+	if (unlikely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_XLATE_SRC)) {
+		u16 tcp_csum;
+		u32 sum;
+
+		iph->saddr = cm->xlate_src_ip[0];
+		tcph->source = cm->xlate_src_port;
+
+		/*
+		 * Update the TCP checksum to reflect the translated
+		 * source address and port.
+		 */
+		tcp_csum = tcph->check;
+		sum = tcp_csum + cm->xlate_src_csum_adjustment;
+		sum = (sum & 0xffff) + (sum >> 16);
+		tcph->check = (u16)sum;
+	}
+
+	/*
+	 * Do we have to perform translations of the destination address/port?
+	 */
+	if (unlikely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_XLATE_DEST)) {
+		u16 tcp_csum;
+		u32 sum;
+
+		iph->daddr = cm->xlate_dest_ip[0];
+		tcph->dest = cm->xlate_dest_port;
+
+		/*
+		 * Update the TCP checksum to reflect the translated
+		 * destination address and port.
+		 */
+		tcp_csum = tcph->check;
+		sum = tcp_csum + cm->xlate_dest_csum_adjustment;
+		sum = (sum & 0xffff) + (sum >> 16);
+		tcph->check = (u16)sum;
+	}
+
+	/*
+	 * Update traffic stats.
+	 */
+	cm->rx_packet_count++;
+	cm->rx_byte_count += len;
+
+	/*
+	 * If we're not already on the active list then insert ourselves at the tail
+	 * of the current list.
+	 */
+	if (unlikely(!cm->active)) {
+		cm->active = true;
+		cm->active_prev = si->active_tail;
+		if (likely(si->active_tail)) {
+			si->active_tail->active_next = cm;
+		} else {
+			si->active_head = cm;
+		}
+		si->active_tail = cm;
+	}
+
+	xmit_dev = cm->xmit_dev;
+	skb->dev = xmit_dev;
+
+	/*
+	 * Check to see if we need to write a header.
+	 */
+	if (likely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_WRITE_L2_HDR)) {
+		if (unlikely(!(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_WRITE_FAST_ETH_HDR))) {
+			dev_hard_header(skb, xmit_dev, ETH_P_IPV6,
+					cm->xmit_dest_mac, cm->xmit_src_mac, len);
+		} else {
+			/*
+			 * For the simple case we write this really fast.
+			 */
+			struct sfe_ipv6_eth_hdr *eth = (struct sfe_ipv6_eth_hdr *)__skb_push(skb, ETH_HLEN);
+			eth->h_proto = htons(ETH_P_IPV6);
+			eth->h_dest[0] = cm->xmit_dest_mac[0];
+			eth->h_dest[1] = cm->xmit_dest_mac[1];
+			eth->h_dest[2] = cm->xmit_dest_mac[2];
+			eth->h_source[0] = cm->xmit_src_mac[0];
+			eth->h_source[1] = cm->xmit_src_mac[1];
+			eth->h_source[2] = cm->xmit_src_mac[2];
+		}
+	}
+
+	/*
+	 * Update priority of skb.
+	 */
+	if (unlikely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_PRIORITY_REMARK)) {
+		skb->priority = cm->priority;
+	}
+
+	/*
+	 * Mark outgoing packet.
+	 */
+	skb->mark = cm->connection->mark;
+	if (skb->mark) {
+		DEBUG_TRACE("SKB MARK is NON ZERO %x\n", skb->mark);
+	}
+
+	si->packets_forwarded++;
+	spin_unlock_bh(&si->lock);
+
+	/*
+	 * We're going to check for GSO flags when we transmit the packet so
+	 * start fetching the necessary cache line now.
+	 */
+	prefetch(skb_shinfo(skb));
+
+	/*
+	 * Mark that this packet has been fast forwarded.
+	 */
+	skb->fast_forwarded = 1;
+
+	/*
+	 * Send the packet on its way.
+	 */
+	dev_queue_xmit(skb);
+
+	return 1;
+}
+
+/*
+ * sfe_ipv6_recv_icmp()
+ *	Handle ICMP packet receives.
+ *
+ * ICMP packets aren't handled as a "fast path" and always have us process them
+ * through the default Linux stack. What we do need to do is look for any errors
+ * about connections we are handling in the fast path. If we find any such
+ * connections then we want to flush their state so that the ICMP error path
+ * within Linux has all of the correct state should it need it.
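
The address and port rewrites above do not recompute the TCP checksum; they add a 16-bit adjustment that was precomputed when the rule was created and fold the carry back into the low 16 bits. A standalone sketch of that one's-complement update (illustrative names):

#include <stdint.h>
#include <stdio.h>

/* Apply a precomputed one's-complement adjustment to a checksum field. */
static uint16_t csum_apply_adjustment(uint16_t csum, uint16_t adjustment)
{
	uint32_t sum = (uint32_t)csum + adjustment;

	sum = (sum & 0xffff) + (sum >> 16);	/* fold the carry back in */
	return (uint16_t)sum;
}

int main(void)
{
	/* a checksum of 0xfffe plus an adjustment of 0x0003 folds to 0x0002 */
	printf("0x%04x\n", csum_apply_adjustment(0xfffe, 0x0003));
	return 0;
}
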
+ */ +static int sfe_ipv6_recv_icmp(struct sfe_ipv6 *si, struct sk_buff *skb, struct net_device *dev, + unsigned int len, struct sfe_ipv6_ip_hdr *iph, unsigned int ihl) +{ + struct icmp6hdr *icmph; + struct sfe_ipv6_ip_hdr *icmp_iph; + struct sfe_ipv6_udp_hdr *icmp_udph; + struct sfe_ipv6_tcp_hdr *icmp_tcph; + struct sfe_ipv6_addr *src_ip; + struct sfe_ipv6_addr *dest_ip; + __be16 src_port; + __be16 dest_port; + struct sfe_ipv6_connection_match *cm; + struct sfe_ipv6_connection *c; + u8 next_hdr; + + /* + * Is our packet too short to contain a valid ICMP header? + */ + len -= ihl; + if (!pskb_may_pull(skb, ihl + sizeof(struct icmp6hdr))) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_ICMP_HEADER_INCOMPLETE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("packet too short for ICMP header\n"); + return 0; + } + + /* + * We only handle "destination unreachable" and "time exceeded" messages. + */ + icmph = (struct icmp6hdr *)(skb->data + ihl); + if ((icmph->icmp6_type != ICMPV6_DEST_UNREACH) + && (icmph->icmp6_type != ICMPV6_TIME_EXCEED)) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_ICMP_UNHANDLED_TYPE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("unhandled ICMP type: 0x%x\n", icmph->icmp6_type); + return 0; + } + + /* + * Do we have the full embedded IP header? + * We should have 8 bytes of next L4 header - that's enough to identify + * the connection. + */ + len -= sizeof(struct icmp6hdr); + ihl += sizeof(struct icmp6hdr); + if (!pskb_may_pull(skb, ihl + sizeof(struct sfe_ipv6_ip_hdr) + sizeof(struct sfe_ipv6_ext_hdr))) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_ICMP_IPV6_HEADER_INCOMPLETE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("Embedded IP header not complete\n"); + return 0; + } + + /* + * Is our embedded IP version wrong? + */ + icmp_iph = (struct sfe_ipv6_ip_hdr *)(icmph + 1); + if (unlikely(icmp_iph->version != 6)) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_ICMP_IPV6_NON_V6]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("IP version: %u\n", icmp_iph->version); + return 0; + } + + len -= sizeof(struct sfe_ipv6_ip_hdr); + ihl += sizeof(struct sfe_ipv6_ip_hdr); + next_hdr = icmp_iph->nexthdr; + while (unlikely(sfe_ipv6_is_ext_hdr(next_hdr))) { + struct sfe_ipv6_ext_hdr *ext_hdr; + unsigned int ext_hdr_len; + + ext_hdr = (struct sfe_ipv6_ext_hdr *)(skb->data + ihl); + if (next_hdr == SFE_IPV6_EXT_HDR_FRAG) { + struct sfe_ipv6_frag_hdr *frag_hdr = (struct sfe_ipv6_frag_hdr *)ext_hdr; + unsigned int frag_off = ntohs(frag_hdr->frag_off); + + if (frag_off & SFE_IPV6_FRAG_OFFSET) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_NON_INITIAL_FRAGMENT]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("non-initial fragment\n"); + return 0; + } + } + + ext_hdr_len = ext_hdr->hdr_len; + ext_hdr_len <<= 3; + ext_hdr_len += sizeof(struct sfe_ipv6_ext_hdr); + len -= ext_hdr_len; + ihl += ext_hdr_len; + /* + * We should have 8 bytes of next header - that's enough to identify + * the connection. 
+ */ + if (!pskb_may_pull(skb, ihl + sizeof(struct sfe_ipv6_ext_hdr))) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_HEADER_INCOMPLETE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("extension header %d not completed\n", next_hdr); + return 0; + } + + next_hdr = ext_hdr->next_hdr; + } + + /* + * Handle the embedded transport layer header. + */ + switch (next_hdr) { + case IPPROTO_UDP: + icmp_udph = (struct sfe_ipv6_udp_hdr *)(skb->data + ihl); + src_port = icmp_udph->source; + dest_port = icmp_udph->dest; + break; + + case IPPROTO_TCP: + icmp_tcph = (struct sfe_ipv6_tcp_hdr *)(skb->data + ihl); + src_port = icmp_tcph->source; + dest_port = icmp_tcph->dest; + break; + + default: + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_ICMP_IPV6_UNHANDLED_PROTOCOL]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("Unhandled embedded IP protocol: %u\n", next_hdr); + return 0; + } + + src_ip = &icmp_iph->saddr; + dest_ip = &icmp_iph->daddr; + + spin_lock_bh(&si->lock); + + /* + * Look for a connection match. Note that we reverse the source and destination + * here because our embedded message contains a packet that was sent in the + * opposite direction to the one in which we just received it. It will have + * been sent on the interface from which we received it though so that's still + * ok to use. + */ + cm = sfe_ipv6_find_connection_match(si, dev, icmp_iph->nexthdr, dest_ip, dest_port, src_ip, src_port); + if (unlikely(!cm)) { + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_ICMP_NO_CONNECTION]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("no connection found\n"); + return 0; + } + + /* + * We found a connection so now remove it from the connection list and flush + * its state. + */ + c = cm->connection; + sfe_ipv6_remove_connection(si, c); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_ICMP_FLUSHED_CONNECTION]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; +} + +/* + * sfe_ipv6_recv() + * Handle packet receives and forwaring. + * + * Returns 1 if the packet is forwarded or 0 if it isn't. + */ +int sfe_ipv6_recv(struct net_device *dev, struct sk_buff *skb) +{ + struct sfe_ipv6 *si = &__si6; + unsigned int len; + unsigned int payload_len; + unsigned int ihl = sizeof(struct sfe_ipv6_ip_hdr); + bool flush_on_find = false; + struct sfe_ipv6_ip_hdr *iph; + u8 next_hdr; + + /* + * Check that we have space for an IP header and an uplayer header here. + */ + len = skb->len; + if (!pskb_may_pull(skb, ihl + sizeof(struct sfe_ipv6_ext_hdr))) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_HEADER_INCOMPLETE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("len: %u is too short\n", len); + return 0; + } + + /* + * Is our IP version wrong? + */ + iph = (struct sfe_ipv6_ip_hdr *)skb->data; + if (unlikely(iph->version != 6)) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_NON_V6]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("IP version: %u\n", iph->version); + return 0; + } + + /* + * Does our datagram fit inside the skb? 
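
The ICMP handler looks up the connection with source and destination reversed because the embedded packet travelled in the opposite direction to the error that quotes it. A reduced sketch of that reversal, using a simplified tuple and a stand-in lookup function rather than the driver's real API:

#include <stdint.h>
#include <stdbool.h>

/* Reduced tuple; the driver also keys on device and full IPv6 addresses. */
struct tuple {
	uint8_t  protocol;
	uint32_t src_ip, dest_ip;
	uint16_t src_port, dest_port;
};

/* Stand-in for the flow-table lookup; not the driver's real API. */
static bool lookup_flow(const struct tuple *t)
{
	(void)t;
	return false;
}

/*
 * The packet embedded in an ICMP error travelled in the opposite direction
 * to the error itself, so swap source and destination before the lookup.
 */
bool lookup_flow_for_icmp_error(const struct tuple *embedded)
{
	struct tuple reversed = {
		.protocol  = embedded->protocol,
		.src_ip    = embedded->dest_ip,
		.dest_ip   = embedded->src_ip,
		.src_port  = embedded->dest_port,
		.dest_port = embedded->src_port,
	};

	return lookup_flow(&reversed);
}
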
+ */ + payload_len = ntohs(iph->payload_len); + if (unlikely(payload_len > (len - ihl))) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_DATAGRAM_INCOMPLETE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("payload_len: %u, exceeds len: %u\n", payload_len, (len - (unsigned int)sizeof(struct sfe_ipv6_ip_hdr))); + return 0; + } + + next_hdr = iph->nexthdr; + while (unlikely(sfe_ipv6_is_ext_hdr(next_hdr))) { + struct sfe_ipv6_ext_hdr *ext_hdr; + unsigned int ext_hdr_len; + + ext_hdr = (struct sfe_ipv6_ext_hdr *)(skb->data + ihl); + if (next_hdr == SFE_IPV6_EXT_HDR_FRAG) { + struct sfe_ipv6_frag_hdr *frag_hdr = (struct sfe_ipv6_frag_hdr *)ext_hdr; + unsigned int frag_off = ntohs(frag_hdr->frag_off); + + if (frag_off & SFE_IPV6_FRAG_OFFSET) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_NON_INITIAL_FRAGMENT]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("non-initial fragment\n"); + return 0; + } + } + + ext_hdr_len = ext_hdr->hdr_len; + ext_hdr_len <<= 3; + ext_hdr_len += sizeof(struct sfe_ipv6_ext_hdr); + ihl += ext_hdr_len; + if (!pskb_may_pull(skb, ihl + sizeof(struct sfe_ipv6_ext_hdr))) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_HEADER_INCOMPLETE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("extension header %d not completed\n", next_hdr); + return 0; + } + + flush_on_find = true; + next_hdr = ext_hdr->next_hdr; + } + + if (IPPROTO_UDP == next_hdr) { + return sfe_ipv6_recv_udp(si, skb, dev, len, iph, ihl, flush_on_find); + } + + if (IPPROTO_TCP == next_hdr) { + return sfe_ipv6_recv_tcp(si, skb, dev, len, iph, ihl, flush_on_find); + } + + if (IPPROTO_ICMPV6 == next_hdr) { + return sfe_ipv6_recv_icmp(si, skb, dev, len, iph, ihl); + } + + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_UNHANDLED_PROTOCOL]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("not UDP, TCP or ICMP: %u\n", next_hdr); + return 0; +} + +/* + * sfe_ipv6_update_tcp_state() + * update TCP window variables. 
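
Both receive paths skip IPv6 extension headers the same way: each header's length byte counts 8-octet units beyond the first 8 bytes, and a fragment header with a non-zero offset ends fast-path handling. A standalone sketch of that walk over a raw buffer (illustrative constants and types):

#include <stdint.h>
#include <stddef.h>
#include <stdbool.h>
#include <string.h>
#include <arpa/inet.h>

#define EXT_HDR_FRAG	44	/* IPPROTO_FRAGMENT */
#define FRAG_OFF_MASK	0xfff8	/* non-zero means not the first fragment */

static bool is_ext_hdr(uint8_t h)
{
	/* hop-by-hop, routing, fragment, destination options (a subset) */
	return h == 0 || h == 43 || h == 44 || h == 60;
}

/*
 * Advance *offset past any extension headers, leaving *next at the
 * upper-layer protocol.  Returns false for a truncated packet or a
 * non-initial fragment, which the fast path hands back to the stack.
 */
bool walk_ext_hdrs(const uint8_t *pkt, size_t len, size_t *offset, uint8_t *next)
{
	while (is_ext_hdr(*next)) {
		uint8_t nh, hdr_len;
		uint16_t frag_off;

		if (*offset + 8 > len)
			return false;

		nh = pkt[*offset];		/* next header */
		hdr_len = pkt[*offset + 1];	/* 8-octet units beyond the first 8 bytes */

		if (*next == EXT_HDR_FRAG) {
			memcpy(&frag_off, pkt + *offset + 2, sizeof(frag_off));
			if (ntohs(frag_off) & FRAG_OFF_MASK)
				return false;
		}

		*offset += ((size_t)hdr_len << 3) + 8;
		*next = nh;
	}

	return true;
}
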
+ */ +static void +sfe_ipv6_update_tcp_state(struct sfe_ipv6_connection *c, + struct sfe_connection_create *sic) +{ + struct sfe_ipv6_connection_match *orig_cm; + struct sfe_ipv6_connection_match *repl_cm; + struct sfe_ipv6_tcp_connection_match *orig_tcp; + struct sfe_ipv6_tcp_connection_match *repl_tcp; + + orig_cm = c->original_match; + repl_cm = c->reply_match; + orig_tcp = &orig_cm->protocol_state.tcp; + repl_tcp = &repl_cm->protocol_state.tcp; + + /* update orig */ + if (orig_tcp->max_win < sic->src_td_max_window) { + orig_tcp->max_win = sic->src_td_max_window; + } + if ((s32)(orig_tcp->end - sic->src_td_end) < 0) { + orig_tcp->end = sic->src_td_end; + } + if ((s32)(orig_tcp->max_end - sic->src_td_max_end) < 0) { + orig_tcp->max_end = sic->src_td_max_end; + } + + /* update reply */ + if (repl_tcp->max_win < sic->dest_td_max_window) { + repl_tcp->max_win = sic->dest_td_max_window; + } + if ((s32)(repl_tcp->end - sic->dest_td_end) < 0) { + repl_tcp->end = sic->dest_td_end; + } + if ((s32)(repl_tcp->max_end - sic->dest_td_max_end) < 0) { + repl_tcp->max_end = sic->dest_td_max_end; + } + + /* update match flags */ + orig_cm->flags &= ~SFE_IPV6_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK; + repl_cm->flags &= ~SFE_IPV6_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK; + if (sic->flags & SFE_CREATE_FLAG_NO_SEQ_CHECK) { + orig_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK; + repl_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK; + } +} + +/* + * sfe_ipv6_update_protocol_state() + * update protocol specified state machine. + */ +static void +sfe_ipv6_update_protocol_state(struct sfe_ipv6_connection *c, + struct sfe_connection_create *sic) +{ + switch (sic->protocol) { + case IPPROTO_TCP: + sfe_ipv6_update_tcp_state(c, sic); + break; + } +} + +/* + * sfe_ipv6_update_rule() + * update forwarding rule after rule is created. + */ +void sfe_ipv6_update_rule(struct sfe_connection_create *sic) +{ + struct sfe_ipv6_connection *c; + struct sfe_ipv6 *si = &__si6; + + spin_lock_bh(&si->lock); + + c = sfe_ipv6_find_connection(si, + sic->protocol, + sic->src_ip.ip6, + sic->src_port, + sic->dest_ip.ip6, + sic->dest_port); + if (c != NULL) { + sfe_ipv6_update_protocol_state(c, sic); + } + + spin_unlock_bh(&si->lock); +} + +/* + * sfe_ipv6_create_rule() + * Create a forwarding rule. + */ +int sfe_ipv6_create_rule(struct sfe_connection_create *sic) +{ + struct sfe_ipv6 *si = &__si6; + struct sfe_ipv6_connection *c; + struct sfe_ipv6_connection_match *original_cm; + struct sfe_ipv6_connection_match *reply_cm; + struct net_device *dest_dev; + struct net_device *src_dev; + + dest_dev = sic->dest_dev; + src_dev = sic->src_dev; + + if (unlikely((dest_dev->reg_state != NETREG_REGISTERED) || + (src_dev->reg_state != NETREG_REGISTERED))) { + return -EINVAL; + } + + spin_lock_bh(&si->lock); + si->connection_create_requests++; + + /* + * Check to see if there is already a flow that matches the rule we're + * trying to create. If there is then we can't create a new one. + */ + c = sfe_ipv6_find_connection(si, + sic->protocol, + sic->src_ip.ip6, + sic->src_port, + sic->dest_ip.ip6, + sic->dest_port); + if (c != NULL) { + si->connection_create_collisions++; + + /* + * If we already have the flow then it's likely that this + * request to create the connection rule contains more + * up-to-date information. Check and update accordingly. 
+ */ + sfe_ipv6_update_protocol_state(c, sic); + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("connection already exists - mark: %08x, p: %d\n" + " s: %s:%pxM:%pI6:%u, d: %s:%pxM:%pI6:%u\n", + sic->mark, sic->protocol, + sic->src_dev->name, sic->src_mac, sic->src_ip.ip6, ntohs(sic->src_port), + sic->dest_dev->name, sic->dest_mac, sic->dest_ip.ip6, ntohs(sic->dest_port)); + return -EADDRINUSE; + } + + /* + * Allocate the various connection tracking objects. + */ + c = (struct sfe_ipv6_connection *)kmalloc(sizeof(struct sfe_ipv6_connection), GFP_ATOMIC); + if (unlikely(!c)) { + spin_unlock_bh(&si->lock); + return -ENOMEM; + } + + original_cm = (struct sfe_ipv6_connection_match *)kmalloc(sizeof(struct sfe_ipv6_connection_match), GFP_ATOMIC); + if (unlikely(!original_cm)) { + spin_unlock_bh(&si->lock); + kfree(c); + return -ENOMEM; + } + + reply_cm = (struct sfe_ipv6_connection_match *)kmalloc(sizeof(struct sfe_ipv6_connection_match), GFP_ATOMIC); + if (unlikely(!reply_cm)) { + spin_unlock_bh(&si->lock); + kfree(original_cm); + kfree(c); + return -ENOMEM; + } + + /* + * Fill in the "original" direction connection matching object. + * Note that the transmit MAC address is "dest_mac_xlate" because + * we always know both ends of a connection by their translated + * addresses and not their public addresses. + */ + original_cm->match_dev = src_dev; + original_cm->match_protocol = sic->protocol; + original_cm->match_src_ip[0] = sic->src_ip.ip6[0]; + original_cm->match_src_port = sic->src_port; + original_cm->match_dest_ip[0] = sic->dest_ip.ip6[0]; + original_cm->match_dest_port = sic->dest_port; + original_cm->xlate_src_ip[0] = sic->src_ip_xlate.ip6[0]; + original_cm->xlate_src_port = sic->src_port_xlate; + original_cm->xlate_dest_ip[0] = sic->dest_ip_xlate.ip6[0]; + original_cm->xlate_dest_port = sic->dest_port_xlate; + original_cm->rx_packet_count = 0; + original_cm->rx_packet_count64 = 0; + original_cm->rx_byte_count = 0; + original_cm->rx_byte_count64 = 0; + original_cm->xmit_dev = dest_dev; + original_cm->xmit_dev_mtu = sic->dest_mtu; + memcpy(original_cm->xmit_src_mac, dest_dev->dev_addr, ETH_ALEN); + memcpy(original_cm->xmit_dest_mac, sic->dest_mac_xlate, ETH_ALEN); + original_cm->connection = c; + original_cm->counter_match = reply_cm; + original_cm->flags = 0; + if (sic->flags & SFE_CREATE_FLAG_REMARK_PRIORITY) { + original_cm->priority = sic->src_priority; + original_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_PRIORITY_REMARK; + } + if (sic->flags & SFE_CREATE_FLAG_REMARK_DSCP) { + original_cm->dscp = sic->src_dscp << SFE_IPV6_DSCP_SHIFT; + original_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_DSCP_REMARK; + } +#ifdef CONFIG_NF_FLOW_COOKIE + original_cm->flow_cookie = 0; +#endif +#ifdef CONFIG_XFRM + original_cm->flow_accel = sic->original_accel; +#endif + original_cm->active_next = NULL; + original_cm->active_prev = NULL; + original_cm->active = false; + + /* + * For PPP links we don't write an L2 header. For everything else we do. + */ + if (!(dest_dev->flags & IFF_POINTOPOINT)) { + original_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_WRITE_L2_HDR; + + /* + * If our dev writes Ethernet headers then we can write a really fast + * version. + */ + if (dest_dev->header_ops) { + if (dest_dev->header_ops->create == eth_header) { + original_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_WRITE_FAST_ETH_HDR; + } + } + } + + /* + * Fill in the "reply" direction connection matching object. 
+ */ + reply_cm->match_dev = dest_dev; + reply_cm->match_protocol = sic->protocol; + reply_cm->match_src_ip[0] = sic->dest_ip_xlate.ip6[0]; + reply_cm->match_src_port = sic->dest_port_xlate; + reply_cm->match_dest_ip[0] = sic->src_ip_xlate.ip6[0]; + reply_cm->match_dest_port = sic->src_port_xlate; + reply_cm->xlate_src_ip[0] = sic->dest_ip.ip6[0]; + reply_cm->xlate_src_port = sic->dest_port; + reply_cm->xlate_dest_ip[0] = sic->src_ip.ip6[0]; + reply_cm->xlate_dest_port = sic->src_port; + reply_cm->rx_packet_count = 0; + reply_cm->rx_packet_count64 = 0; + reply_cm->rx_byte_count = 0; + reply_cm->rx_byte_count64 = 0; + reply_cm->xmit_dev = src_dev; + reply_cm->xmit_dev_mtu = sic->src_mtu; + memcpy(reply_cm->xmit_src_mac, src_dev->dev_addr, ETH_ALEN); + memcpy(reply_cm->xmit_dest_mac, sic->src_mac, ETH_ALEN); + reply_cm->connection = c; + reply_cm->counter_match = original_cm; + reply_cm->flags = 0; + if (sic->flags & SFE_CREATE_FLAG_REMARK_PRIORITY) { + reply_cm->priority = sic->dest_priority; + reply_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_PRIORITY_REMARK; + } + if (sic->flags & SFE_CREATE_FLAG_REMARK_DSCP) { + reply_cm->dscp = sic->dest_dscp << SFE_IPV6_DSCP_SHIFT; + reply_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_DSCP_REMARK; + } +#ifdef CONFIG_NF_FLOW_COOKIE + reply_cm->flow_cookie = 0; +#endif +#ifdef CONFIG_XFRM + reply_cm->flow_accel = sic->reply_accel; +#endif + reply_cm->active_next = NULL; + reply_cm->active_prev = NULL; + reply_cm->active = false; + + /* + * For PPP links we don't write an L2 header. For everything else we do. + */ + if (!(src_dev->flags & IFF_POINTOPOINT)) { + reply_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_WRITE_L2_HDR; + + /* + * If our dev writes Ethernet headers then we can write a really fast + * version. + */ + if (src_dev->header_ops) { + if (src_dev->header_ops->create == eth_header) { + reply_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_WRITE_FAST_ETH_HDR; + } + } + } + + + if (!sfe_ipv6_addr_equal(sic->dest_ip.ip6, sic->dest_ip_xlate.ip6) || sic->dest_port != sic->dest_port_xlate) { + original_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_XLATE_DEST; + reply_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_XLATE_SRC; + } + + if (!sfe_ipv6_addr_equal(sic->src_ip.ip6, sic->src_ip_xlate.ip6) || sic->src_port != sic->src_port_xlate) { + original_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_XLATE_SRC; + reply_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_XLATE_DEST; + } + + c->protocol = sic->protocol; + c->src_ip[0] = sic->src_ip.ip6[0]; + c->src_ip_xlate[0] = sic->src_ip_xlate.ip6[0]; + c->src_port = sic->src_port; + c->src_port_xlate = sic->src_port_xlate; + c->original_dev = src_dev; + c->original_match = original_cm; + c->dest_ip[0] = sic->dest_ip.ip6[0]; + c->dest_ip_xlate[0] = sic->dest_ip_xlate.ip6[0]; + c->dest_port = sic->dest_port; + c->dest_port_xlate = sic->dest_port_xlate; + c->reply_dev = dest_dev; + c->reply_match = reply_cm; + c->mark = sic->mark; + c->debug_read_seq = 0; + c->last_sync_jiffies = get_jiffies_64(); + + /* + * Take hold of our source and dest devices for the duration of the connection. + */ + dev_hold(c->original_dev); + dev_hold(c->reply_dev); + + /* + * Initialize the protocol-specific information that we track. + */ + switch (sic->protocol) { + case IPPROTO_TCP: + original_cm->protocol_state.tcp.win_scale = sic->src_td_window_scale; + original_cm->protocol_state.tcp.max_win = sic->src_td_max_window ? 
sic->src_td_max_window : 1; + original_cm->protocol_state.tcp.end = sic->src_td_end; + original_cm->protocol_state.tcp.max_end = sic->src_td_max_end; + reply_cm->protocol_state.tcp.win_scale = sic->dest_td_window_scale; + reply_cm->protocol_state.tcp.max_win = sic->dest_td_max_window ? sic->dest_td_max_window : 1; + reply_cm->protocol_state.tcp.end = sic->dest_td_end; + reply_cm->protocol_state.tcp.max_end = sic->dest_td_max_end; + if (sic->flags & SFE_CREATE_FLAG_NO_SEQ_CHECK) { + original_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK; + reply_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK; + } + break; + } + + sfe_ipv6_connection_match_compute_translations(original_cm); + sfe_ipv6_connection_match_compute_translations(reply_cm); + sfe_ipv6_insert_connection(si, c); + + spin_unlock_bh(&si->lock); + + /* + * We have everything we need! + */ + DEBUG_INFO("new connection - mark: %08x, p: %d\n" + " s: %s:%pxM(%pxM):%pI6(%pI6):%u(%u)\n" + " d: %s:%pxM(%pxM):%pI6(%pI6):%u(%u)\n", + sic->mark, sic->protocol, + sic->src_dev->name, sic->src_mac, sic->src_mac_xlate, + sic->src_ip.ip6, sic->src_ip_xlate.ip6, ntohs(sic->src_port), ntohs(sic->src_port_xlate), + dest_dev->name, sic->dest_mac, sic->dest_mac_xlate, + sic->dest_ip.ip6, sic->dest_ip_xlate.ip6, ntohs(sic->dest_port), ntohs(sic->dest_port_xlate)); + + return 0; +} + +/* + * sfe_ipv6_destroy_rule() + * Destroy a forwarding rule. + */ +void sfe_ipv6_destroy_rule(struct sfe_connection_destroy *sid) +{ + struct sfe_ipv6 *si = &__si6; + struct sfe_ipv6_connection *c; + + spin_lock_bh(&si->lock); + si->connection_destroy_requests++; + + /* + * Check to see if we have a flow that matches the rule we're trying + * to destroy. If there isn't then we can't destroy it. + */ + c = sfe_ipv6_find_connection(si, sid->protocol, sid->src_ip.ip6, sid->src_port, + sid->dest_ip.ip6, sid->dest_port); + if (!c) { + si->connection_destroy_misses++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("connection does not exist - p: %d, s: %pI6:%u, d: %pI6:%u\n", + sid->protocol, sid->src_ip.ip6, ntohs(sid->src_port), + sid->dest_ip.ip6, ntohs(sid->dest_port)); + return; + } + + /* + * Remove our connection details from the hash tables. + */ + sfe_ipv6_remove_connection(si, c); + spin_unlock_bh(&si->lock); + + sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_DESTROY); + + DEBUG_INFO("connection destroyed - p: %d, s: %pI6:%u, d: %pI6:%u\n", + sid->protocol, sid->src_ip.ip6, ntohs(sid->src_port), + sid->dest_ip.ip6, ntohs(sid->dest_port)); +} + +/* + * sfe_ipv6_register_sync_rule_callback() + * Register a callback for rule synchronization. + */ +void sfe_ipv6_register_sync_rule_callback(sfe_sync_rule_callback_t sync_rule_callback) +{ + struct sfe_ipv6 *si = &__si6; + + spin_lock_bh(&si->lock); + rcu_assign_pointer(si->sync_rule_callback, sync_rule_callback); + spin_unlock_bh(&si->lock); +} + +/* + * sfe_ipv6_get_debug_dev() + */ +static ssize_t sfe_ipv6_get_debug_dev(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct sfe_ipv6 *si = &__si6; + ssize_t count; + int num; + + spin_lock_bh(&si->lock); + num = si->debug_dev; + spin_unlock_bh(&si->lock); + + count = snprintf(buf, (ssize_t)PAGE_SIZE, "%d\n", num); + return count; +} + +/* + * sfe_ipv6_destroy_all_rules_for_dev() + * Destroy all connections that match a particular device. + * + * If we pass dev as NULL then this destroys all connections. 
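
Rule creation builds two match entries that mirror one another: the reply direction matches on the translated destination tuple and rewrites back to the original source tuple, and each entry points at the other through counter_match. A reduced sketch of how both directions derive from one create request, with simplified types:

#include <stdint.h>

struct tuple {
	uint32_t ip;
	uint16_t port;
};

struct match {
	struct tuple match_src, match_dest;	/* what an incoming packet must carry */
	struct tuple xlate_src, xlate_dest;	/* what it is rewritten to */
	struct match *counter_match;		/* the opposite direction */
};

struct create_req {
	struct tuple src, src_xlate, dest, dest_xlate;
};

void build_matches(const struct create_req *req,
		   struct match *orig, struct match *reply)
{
	/* original direction: match pre-NAT fields, rewrite to post-NAT ones */
	orig->match_src = req->src;
	orig->match_dest = req->dest;
	orig->xlate_src = req->src_xlate;
	orig->xlate_dest = req->dest_xlate;

	/* reply direction: match the translated tuple, rewrite back */
	reply->match_src = req->dest_xlate;
	reply->match_dest = req->src_xlate;
	reply->xlate_src = req->dest;
	reply->xlate_dest = req->src;

	orig->counter_match = reply;
	reply->counter_match = orig;
}
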
+ */ +void sfe_ipv6_destroy_all_rules_for_dev(struct net_device *dev) +{ + struct sfe_ipv6 *si = &__si6; + struct sfe_ipv6_connection *c; + +another_round: + spin_lock_bh(&si->lock); + + for (c = si->all_connections_head; c; c = c->all_connections_next) { + /* + * Does this connection relate to the device we are destroying? + */ + if (!dev + || (dev == c->original_dev) + || (dev == c->reply_dev)) { + break; + } + } + + if (c) { + sfe_ipv6_remove_connection(si, c); + } + + spin_unlock_bh(&si->lock); + + if (c) { + sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_DESTROY); + goto another_round; + } +} + +/* + * sfe_ipv6_periodic_sync() + */ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 15, 0)) +static void sfe_ipv6_periodic_sync(unsigned long arg) +#else +static void sfe_ipv6_periodic_sync(struct timer_list *tl) +#endif +{ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 15, 0)) + struct sfe_ipv6 *si = (struct sfe_ipv6 *)arg; +#else + struct sfe_ipv6 *si = from_timer(si, tl, timer); +#endif + u64 now_jiffies; + int quota; + sfe_sync_rule_callback_t sync_rule_callback; + + now_jiffies = get_jiffies_64(); + + rcu_read_lock(); + sync_rule_callback = rcu_dereference(si->sync_rule_callback); + if (!sync_rule_callback) { + rcu_read_unlock(); + goto done; + } + + spin_lock_bh(&si->lock); + sfe_ipv6_update_summary_stats(si); + + /* + * Get an estimate of the number of connections to parse in this sync. + */ + quota = (si->num_connections + 63) / 64; + + /* + * Walk the "active" list and sync the connection state. + */ + while (quota--) { + struct sfe_ipv6_connection_match *cm; + struct sfe_ipv6_connection_match *counter_cm; + struct sfe_ipv6_connection *c; + struct sfe_connection_sync sis; + + cm = si->active_head; + if (!cm) { + break; + } + + /* + * There's a possibility that our counter match is in the active list too. + * If it is then remove it. + */ + counter_cm = cm->counter_match; + if (counter_cm->active) { + counter_cm->active = false; + + /* + * We must have a connection preceding this counter match + * because that's the one that got us to this point, so we don't have + * to worry about removing the head of the list. + */ + counter_cm->active_prev->active_next = counter_cm->active_next; + + if (likely(counter_cm->active_next)) { + counter_cm->active_next->active_prev = counter_cm->active_prev; + } else { + si->active_tail = counter_cm->active_prev; + } + + counter_cm->active_next = NULL; + counter_cm->active_prev = NULL; + } + + /* + * Now remove the head of the active scan list. + */ + cm->active = false; + si->active_head = cm->active_next; + if (likely(cm->active_next)) { + cm->active_next->active_prev = NULL; + } else { + si->active_tail = NULL; + } + cm->active_next = NULL; + + /* + * Sync the connection state. + */ + c = cm->connection; + sfe_ipv6_gen_sync_connection(si, c, &sis, SFE_SYNC_REASON_STATS, now_jiffies); + + /* + * We don't want to be holding the lock when we sync! + */ + spin_unlock_bh(&si->lock); + sync_rule_callback(&sis); + spin_lock_bh(&si->lock); + } + + spin_unlock_bh(&si->lock); + rcu_read_unlock(); + +done: + mod_timer(&si->timer, jiffies + ((HZ + 99) / 100)); +} + +/* + * sfe_ipv6_debug_dev_read_start() + * Generate part of the XML output. 
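
The periodic sync above processes at most (num_connections + 63) / 64 entries roughly every 10 ms, removing the counter match from the active list when it is present, then detaching the head, and it drops the lock while the sync callback runs. A sketch of the head-of-list removal with simplified types:

#include <stddef.h>

struct conn_match {
	struct conn_match *active_next;
	struct conn_match *active_prev;
	int active;
};

struct engine {
	struct conn_match *active_head;
	struct conn_match *active_tail;
};

/* Detach the head of the active list, as each sync iteration does. */
struct conn_match *active_list_pop(struct engine *e)
{
	struct conn_match *cm = e->active_head;

	if (!cm)
		return NULL;

	cm->active = 0;
	e->active_head = cm->active_next;
	if (e->active_head)
		e->active_head->active_prev = NULL;
	else
		e->active_tail = NULL;
	cm->active_next = NULL;

	return cm;
}
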
+ */ +static bool sfe_ipv6_debug_dev_read_start(struct sfe_ipv6 *si, char *buffer, char *msg, size_t *length, + int *total_read, struct sfe_ipv6_debug_xml_write_state *ws) +{ + int bytes_read; + + si->debug_read_seq++; + + bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\n"); + if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) { + return false; + } + + *length -= bytes_read; + *total_read += bytes_read; + + ws->state++; + return true; +} + +/* + * sfe_ipv6_debug_dev_read_connections_start() + * Generate part of the XML output. + */ +static bool sfe_ipv6_debug_dev_read_connections_start(struct sfe_ipv6 *si, char *buffer, char *msg, size_t *length, + int *total_read, struct sfe_ipv6_debug_xml_write_state *ws) +{ + int bytes_read; + + bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t\n"); + if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) { + return false; + } + + *length -= bytes_read; + *total_read += bytes_read; + + ws->state++; + return true; +} + +/* + * sfe_ipv6_debug_dev_read_connections_connection() + * Generate part of the XML output. + */ +static bool sfe_ipv6_debug_dev_read_connections_connection(struct sfe_ipv6 *si, char *buffer, char *msg, size_t *length, + int *total_read, struct sfe_ipv6_debug_xml_write_state *ws) +{ + struct sfe_ipv6_connection *c; + struct sfe_ipv6_connection_match *original_cm; + struct sfe_ipv6_connection_match *reply_cm; + int bytes_read; + int protocol; + struct net_device *src_dev; + struct sfe_ipv6_addr src_ip; + struct sfe_ipv6_addr src_ip_xlate; + __be16 src_port; + __be16 src_port_xlate; + u64 src_rx_packets; + u64 src_rx_bytes; + struct net_device *dest_dev; + struct sfe_ipv6_addr dest_ip; + struct sfe_ipv6_addr dest_ip_xlate; + __be16 dest_port; + __be16 dest_port_xlate; + u64 dest_rx_packets; + u64 dest_rx_bytes; + u64 last_sync_jiffies; + u32 mark, src_priority, dest_priority, src_dscp, dest_dscp; +#ifdef CONFIG_NF_FLOW_COOKIE + int src_flow_cookie, dst_flow_cookie; +#endif + + spin_lock_bh(&si->lock); + + for (c = si->all_connections_head; c; c = c->all_connections_next) { + if (c->debug_read_seq < si->debug_read_seq) { + c->debug_read_seq = si->debug_read_seq; + break; + } + } + + /* + * If there were no connections then move to the next state. 
+ */ + if (!c) { + spin_unlock_bh(&si->lock); + ws->state++; + return true; + } + + original_cm = c->original_match; + reply_cm = c->reply_match; + + protocol = c->protocol; + src_dev = c->original_dev; + src_ip = c->src_ip[0]; + src_ip_xlate = c->src_ip_xlate[0]; + src_port = c->src_port; + src_port_xlate = c->src_port_xlate; + src_priority = original_cm->priority; + src_dscp = original_cm->dscp >> SFE_IPV6_DSCP_SHIFT; + + sfe_ipv6_connection_match_update_summary_stats(original_cm); + sfe_ipv6_connection_match_update_summary_stats(reply_cm); + + src_rx_packets = original_cm->rx_packet_count64; + src_rx_bytes = original_cm->rx_byte_count64; + dest_dev = c->reply_dev; + dest_ip = c->dest_ip[0]; + dest_ip_xlate = c->dest_ip_xlate[0]; + dest_port = c->dest_port; + dest_port_xlate = c->dest_port_xlate; + dest_priority = reply_cm->priority; + dest_dscp = reply_cm->dscp >> SFE_IPV6_DSCP_SHIFT; + dest_rx_packets = reply_cm->rx_packet_count64; + dest_rx_bytes = reply_cm->rx_byte_count64; + last_sync_jiffies = get_jiffies_64() - c->last_sync_jiffies; + mark = c->mark; +#ifdef CONFIG_NF_FLOW_COOKIE + src_flow_cookie = original_cm->flow_cookie; + dst_flow_cookie = reply_cm->flow_cookie; +#endif + spin_unlock_bh(&si->lock); + + bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t\t\n", + protocol, + src_dev->name, + &src_ip, &src_ip_xlate, + ntohs(src_port), ntohs(src_port_xlate), + src_priority, src_dscp, + src_rx_packets, src_rx_bytes, + dest_dev->name, + &dest_ip, &dest_ip_xlate, + ntohs(dest_port), ntohs(dest_port_xlate), + dest_priority, dest_dscp, + dest_rx_packets, dest_rx_bytes, +#ifdef CONFIG_NF_FLOW_COOKIE + src_flow_cookie, dst_flow_cookie, +#endif + last_sync_jiffies, mark); + + if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) { + return false; + } + + *length -= bytes_read; + *total_read += bytes_read; + + return true; +} + +/* + * sfe_ipv6_debug_dev_read_connections_end() + * Generate part of the XML output. + */ +static bool sfe_ipv6_debug_dev_read_connections_end(struct sfe_ipv6 *si, char *buffer, char *msg, size_t *length, + int *total_read, struct sfe_ipv6_debug_xml_write_state *ws) +{ + int bytes_read; + + bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t\n"); + if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) { + return false; + } + + *length -= bytes_read; + *total_read += bytes_read; + + ws->state++; + return true; +} + +/* + * sfe_ipv6_debug_dev_read_exceptions_start() + * Generate part of the XML output. + */ +static bool sfe_ipv6_debug_dev_read_exceptions_start(struct sfe_ipv6 *si, char *buffer, char *msg, size_t *length, + int *total_read, struct sfe_ipv6_debug_xml_write_state *ws) +{ + int bytes_read; + + bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t\n"); + if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) { + return false; + } + + *length -= bytes_read; + *total_read += bytes_read; + + ws->state++; + return true; +} + +/* + * sfe_ipv6_debug_dev_read_exceptions_exception() + * Generate part of the XML output. 
+ */ +static bool sfe_ipv6_debug_dev_read_exceptions_exception(struct sfe_ipv6 *si, char *buffer, char *msg, size_t *length, + int *total_read, struct sfe_ipv6_debug_xml_write_state *ws) +{ + u64 ct; + + spin_lock_bh(&si->lock); + ct = si->exception_events64[ws->iter_exception]; + spin_unlock_bh(&si->lock); + + if (ct) { + int bytes_read; + + bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, + "\t\t\n", + sfe_ipv6_exception_events_string[ws->iter_exception], + ct); + if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) { + return false; + } + + *length -= bytes_read; + *total_read += bytes_read; + } + + ws->iter_exception++; + if (ws->iter_exception >= SFE_IPV6_EXCEPTION_EVENT_LAST) { + ws->iter_exception = 0; + ws->state++; + } + + return true; +} + +/* + * sfe_ipv6_debug_dev_read_exceptions_end() + * Generate part of the XML output. + */ +static bool sfe_ipv6_debug_dev_read_exceptions_end(struct sfe_ipv6 *si, char *buffer, char *msg, size_t *length, + int *total_read, struct sfe_ipv6_debug_xml_write_state *ws) +{ + int bytes_read; + + bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t\n"); + if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) { + return false; + } + + *length -= bytes_read; + *total_read += bytes_read; + + ws->state++; + return true; +} + +/* + * sfe_ipv6_debug_dev_read_stats() + * Generate part of the XML output. + */ +static bool sfe_ipv6_debug_dev_read_stats(struct sfe_ipv6 *si, char *buffer, char *msg, size_t *length, + int *total_read, struct sfe_ipv6_debug_xml_write_state *ws) +{ + int bytes_read; + unsigned int num_connections; + u64 packets_forwarded; + u64 packets_not_forwarded; + u64 connection_create_requests; + u64 connection_create_collisions; + u64 connection_destroy_requests; + u64 connection_destroy_misses; + u64 connection_flushes; + u64 connection_match_hash_hits; + u64 connection_match_hash_reorders; + + spin_lock_bh(&si->lock); + sfe_ipv6_update_summary_stats(si); + + num_connections = si->num_connections; + packets_forwarded = si->packets_forwarded64; + packets_not_forwarded = si->packets_not_forwarded64; + connection_create_requests = si->connection_create_requests64; + connection_create_collisions = si->connection_create_collisions64; + connection_destroy_requests = si->connection_destroy_requests64; + connection_destroy_misses = si->connection_destroy_misses64; + connection_flushes = si->connection_flushes64; + connection_match_hash_hits = si->connection_match_hash_hits64; + connection_match_hash_reorders = si->connection_match_hash_reorders64; + spin_unlock_bh(&si->lock); + + bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t\n", + num_connections, + packets_forwarded, + packets_not_forwarded, + connection_create_requests, + connection_create_collisions, + connection_destroy_requests, + connection_destroy_misses, + connection_flushes, + connection_match_hash_hits, + connection_match_hash_reorders); + if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) { + return false; + } + + *length -= bytes_read; + *total_read += bytes_read; + + ws->state++; + return true; +} + +/* + * sfe_ipv6_debug_dev_read_end() + * Generate part of the XML output. 
+ */ +static bool sfe_ipv6_debug_dev_read_end(struct sfe_ipv6 *si, char *buffer, char *msg, size_t *length, + int *total_read, struct sfe_ipv6_debug_xml_write_state *ws) +{ + int bytes_read; + + bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\n"); + if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) { + return false; + } + + *length -= bytes_read; + *total_read += bytes_read; + + ws->state++; + return true; +} + +/* + * Array of write functions that write various XML elements that correspond to + * our XML output state machine. + */ +static sfe_ipv6_debug_xml_write_method_t sfe_ipv6_debug_xml_write_methods[SFE_IPV6_DEBUG_XML_STATE_DONE] = { + sfe_ipv6_debug_dev_read_start, + sfe_ipv6_debug_dev_read_connections_start, + sfe_ipv6_debug_dev_read_connections_connection, + sfe_ipv6_debug_dev_read_connections_end, + sfe_ipv6_debug_dev_read_exceptions_start, + sfe_ipv6_debug_dev_read_exceptions_exception, + sfe_ipv6_debug_dev_read_exceptions_end, + sfe_ipv6_debug_dev_read_stats, + sfe_ipv6_debug_dev_read_end, +}; + +/* + * sfe_ipv6_debug_dev_read() + * Send info to userspace upon read request from user + */ +static ssize_t sfe_ipv6_debug_dev_read(struct file *filp, char *buffer, size_t length, loff_t *offset) +{ + char msg[CHAR_DEV_MSG_SIZE]; + int total_read = 0; + struct sfe_ipv6_debug_xml_write_state *ws; + struct sfe_ipv6 *si = &__si6; + + ws = (struct sfe_ipv6_debug_xml_write_state *)filp->private_data; + while ((ws->state != SFE_IPV6_DEBUG_XML_STATE_DONE) && (length > CHAR_DEV_MSG_SIZE)) { + if ((sfe_ipv6_debug_xml_write_methods[ws->state])(si, buffer, msg, &length, &total_read, ws)) { + continue; + } + } + + return total_read; +} + +/* + * sfe_ipv6_debug_dev_write() + * Write to char device resets some stats + */ +static ssize_t sfe_ipv6_debug_dev_write(struct file *filp, const char *buffer, size_t length, loff_t *offset) +{ + struct sfe_ipv6 *si = &__si6; + + spin_lock_bh(&si->lock); + sfe_ipv6_update_summary_stats(si); + + si->packets_forwarded64 = 0; + si->packets_not_forwarded64 = 0; + si->connection_create_requests64 = 0; + si->connection_create_collisions64 = 0; + si->connection_destroy_requests64 = 0; + si->connection_destroy_misses64 = 0; + si->connection_flushes64 = 0; + si->connection_match_hash_hits64 = 0; + si->connection_match_hash_reorders64 = 0; + spin_unlock_bh(&si->lock); + + return length; +} + +/* + * sfe_ipv6_debug_dev_open() + */ +static int sfe_ipv6_debug_dev_open(struct inode *inode, struct file *file) +{ + struct sfe_ipv6_debug_xml_write_state *ws; + + ws = (struct sfe_ipv6_debug_xml_write_state *)file->private_data; + if (ws) { + return 0; + } + + ws = kzalloc(sizeof(struct sfe_ipv6_debug_xml_write_state), GFP_KERNEL); + if (!ws) { + return -ENOMEM; + } + + ws->state = SFE_IPV6_DEBUG_XML_STATE_START; + file->private_data = ws; + + return 0; +} + +/* + * sfe_ipv6_debug_dev_release() + */ +static int sfe_ipv6_debug_dev_release(struct inode *inode, struct file *file) +{ + struct sfe_ipv6_debug_xml_write_state *ws; + + ws = (struct sfe_ipv6_debug_xml_write_state *)file->private_data; + if (ws) { + /* + * We've finished with our output so free the write state. 
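
The debug device walks an array of writer functions indexed by the state recorded in the per-open write state, so each read call resumes wherever the previous one stopped. A self-contained sketch of that table-driven pattern; unlike the loop above, which simply retries a writer that returns false, the sketch stops on failure:

#include <stdbool.h>
#include <stdio.h>

enum state { STATE_START, STATE_BODY, STATE_END, STATE_DONE };

struct writer_ctx {
	enum state state;
};

typedef bool (*write_method_t)(struct writer_ctx *ctx);

static bool write_start(struct writer_ctx *ctx) { puts("start"); ctx->state++; return true; }
static bool write_body(struct writer_ctx *ctx)  { puts("body");  ctx->state++; return true; }
static bool write_end(struct writer_ctx *ctx)   { puts("end");   ctx->state++; return true; }

static const write_method_t methods[STATE_DONE] = {
	write_start,
	write_body,
	write_end,
};

int main(void)
{
	struct writer_ctx ctx = { .state = STATE_START };

	while (ctx.state != STATE_DONE) {
		if (!methods[ctx.state](&ctx))
			break;	/* stop on a failed write instead of retrying */
	}
	return 0;
}
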
+ */ + kfree(ws); + } + + return 0; +} + +/* + * File operations used in the debug char device + */ +static struct file_operations sfe_ipv6_debug_dev_fops = { + .read = sfe_ipv6_debug_dev_read, + .write = sfe_ipv6_debug_dev_write, + .open = sfe_ipv6_debug_dev_open, + .release = sfe_ipv6_debug_dev_release +}; + +#ifdef CONFIG_NF_FLOW_COOKIE +/* + * sfe_ipv6_register_flow_cookie_cb + * register a function in SFE to let SFE use this function to configure flow cookie for a flow + * + * Hardware driver which support flow cookie should register a callback function in SFE. Then SFE + * can use this function to configure flow cookie for a flow. + * return: 0, success; !=0, fail + */ +int sfe_ipv6_register_flow_cookie_cb(sfe_ipv6_flow_cookie_set_func_t cb) +{ + struct sfe_ipv6 *si = &__si6; + + BUG_ON(!cb); + + if (si->flow_cookie_set_func) { + return -1; + } + + rcu_assign_pointer(si->flow_cookie_set_func, cb); + return 0; +} + +/* + * sfe_ipv6_unregister_flow_cookie_cb + * unregister function which is used to configure flow cookie for a flow + * + * return: 0, success; !=0, fail + */ +int sfe_ipv6_unregister_flow_cookie_cb(sfe_ipv6_flow_cookie_set_func_t cb) +{ + struct sfe_ipv6 *si = &__si6; + + RCU_INIT_POINTER(si->flow_cookie_set_func, NULL); + return 0; +} + +/* + * sfe_ipv6_get_flow_cookie() + */ +static ssize_t sfe_ipv6_get_flow_cookie(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct sfe_ipv6 *si = &__si6; + return snprintf(buf, (ssize_t)PAGE_SIZE, "%d\n", si->flow_cookie_enable); +} + +/* + * sfe_ipv6_set_flow_cookie() + */ +static ssize_t sfe_ipv6_set_flow_cookie(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t size) +{ + struct sfe_ipv6 *si = &__si6; + strict_strtol(buf, 0, (long int *)&si->flow_cookie_enable); + + return size; +} + +/* + * sysfs attributes. + */ +static const struct device_attribute sfe_ipv6_flow_cookie_attr = + __ATTR(flow_cookie_enable, S_IWUSR | S_IRUGO, sfe_ipv6_get_flow_cookie, sfe_ipv6_set_flow_cookie); +#endif /*CONFIG_NF_FLOW_COOKIE*/ + +/* + * sfe_ipv6_init() + */ +static int __init sfe_ipv6_init(void) +{ + struct sfe_ipv6 *si = &__si6; + int result = -1; + + DEBUG_INFO("SFE IPv6 init\n"); + + /* + * Create sys/sfe_ipv6 + */ + si->sys_sfe_ipv6 = kobject_create_and_add("sfe_ipv6", NULL); + if (!si->sys_sfe_ipv6) { + DEBUG_ERROR("failed to register sfe_ipv6\n"); + goto exit1; + } + + /* + * Create files, one for each parameter supported by this module. + */ + result = sysfs_create_file(si->sys_sfe_ipv6, &sfe_ipv6_debug_dev_attr.attr); + if (result) { + DEBUG_ERROR("failed to register debug dev file: %d\n", result); + goto exit2; + } + +#ifdef CONFIG_NF_FLOW_COOKIE + result = sysfs_create_file(si->sys_sfe_ipv6, &sfe_ipv6_flow_cookie_attr.attr); + if (result) { + DEBUG_ERROR("failed to register flow cookie enable file: %d\n", result); + goto exit3; + } +#endif /* CONFIG_NF_FLOW_COOKIE */ + + /* + * Register our debug char device. + */ + result = register_chrdev(0, "sfe_ipv6", &sfe_ipv6_debug_dev_fops); + if (result < 0) { + DEBUG_ERROR("Failed to register chrdev: %d\n", result); + goto exit4; + } + + si->debug_dev = result; + + /* + * Create a timer to handle periodic statistics. 
+ */ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 15, 0)) + setup_timer(&si->timer, sfe_ipv6_periodic_sync, (unsigned long)si); +#else + timer_setup(&si->timer, sfe_ipv6_periodic_sync, 0); +#endif + mod_timer(&si->timer, jiffies + ((HZ + 99) / 100)); + + spin_lock_init(&si->lock); + + return 0; + +exit4: +#ifdef CONFIG_NF_FLOW_COOKIE + sysfs_remove_file(si->sys_sfe_ipv6, &sfe_ipv6_flow_cookie_attr.attr); + +exit3: +#endif /* CONFIG_NF_FLOW_COOKIE */ + sysfs_remove_file(si->sys_sfe_ipv6, &sfe_ipv6_debug_dev_attr.attr); + +exit2: + kobject_put(si->sys_sfe_ipv6); + +exit1: + return result; +} + +/* + * sfe_ipv6_exit() + */ +static void __exit sfe_ipv6_exit(void) +{ + struct sfe_ipv6 *si = &__si6; + + DEBUG_INFO("SFE IPv6 exit\n"); + + /* + * Destroy all connections. + */ + sfe_ipv6_destroy_all_rules_for_dev(NULL); + + del_timer_sync(&si->timer); + + unregister_chrdev(si->debug_dev, "sfe_ipv6"); + +#ifdef CONFIG_NF_FLOW_COOKIE + sysfs_remove_file(si->sys_sfe_ipv6, &sfe_ipv6_flow_cookie_attr.attr); +#endif /* CONFIG_NF_FLOW_COOKIE */ + sysfs_remove_file(si->sys_sfe_ipv6, &sfe_ipv6_debug_dev_attr.attr); + + kobject_put(si->sys_sfe_ipv6); +} + +module_init(sfe_ipv6_init) +module_exit(sfe_ipv6_exit) + +EXPORT_SYMBOL(sfe_ipv6_recv); +EXPORT_SYMBOL(sfe_ipv6_create_rule); +EXPORT_SYMBOL(sfe_ipv6_destroy_rule); +EXPORT_SYMBOL(sfe_ipv6_destroy_all_rules_for_dev); +EXPORT_SYMBOL(sfe_ipv6_register_sync_rule_callback); +EXPORT_SYMBOL(sfe_ipv6_mark_rule); +EXPORT_SYMBOL(sfe_ipv6_update_rule); +#ifdef CONFIG_NF_FLOW_COOKIE +EXPORT_SYMBOL(sfe_ipv6_register_flow_cookie_cb); +EXPORT_SYMBOL(sfe_ipv6_unregister_flow_cookie_cb); +#endif + +MODULE_DESCRIPTION("Shortcut Forwarding Engine - IPv6 support"); +MODULE_LICENSE("Dual BSD/GPL"); + diff --git a/shortcut-fe/simulated-driver/sfe_drv.c b/shortcut-fe/simulated-driver/sfe_drv.c new file mode 100644 index 000000000..98677d911 --- /dev/null +++ b/shortcut-fe/simulated-driver/sfe_drv.c @@ -0,0 +1,1323 @@ +/* + * sfe_drv.c + * simulated sfe driver for shortcut forwarding engine. + * + * Copyright (c) 2015,2016 The Linux Foundation. All rights reserved. + * Permission to use, copy, modify, and/or distribute this software for + * any purpose with or without fee is hereby granted, provided that the + * above copyright notice and this permission notice appear in all copies. + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT + * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
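
Initialisation follows the usual goto-unwind idiom: each acquired resource gets a label, and a failure jumps to the label that releases everything acquired so far, in reverse order. A stripped-down sketch of the pattern with hypothetical resource helpers standing in for the kobject, sysfs file and character device:

#include <stdbool.h>

/* Hypothetical resources, purely for illustration. */
static bool acquire_a(void) { return true; }
static bool acquire_b(void) { return true; }
static bool acquire_c(void) { return true; }
static void release_a(void) { }
static void release_b(void) { }

int module_init_pattern(void)
{
	int result = -1;	/* the real code keeps the error from the failed step */

	if (!acquire_a())
		goto exit1;

	if (!acquire_b())
		goto exit2;

	if (!acquire_c())
		goto exit3;

	return 0;

exit3:
	release_b();
exit2:
	release_a();
exit1:
	return result;
}
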
+ */ +#include +#include +#include +#include +#include +#include +#include + +#include "../shortcut-fe/sfe.h" +#include "../shortcut-fe/sfe_cm.h" +#include "sfe_drv.h" + +typedef enum sfe_drv_exception { + SFE_DRV_EXCEPTION_IPV4_MSG_UNKNOW, + SFE_DRV_EXCEPTION_IPV6_MSG_UNKNOW, + SFE_DRV_EXCEPTION_CONNECTION_INVALID, + SFE_DRV_EXCEPTION_NOT_SUPPORT_BRIDGE, + SFE_DRV_EXCEPTION_TCP_INVALID, + SFE_DRV_EXCEPTION_PROTOCOL_NOT_SUPPORT, + SFE_DRV_EXCEPTION_SRC_DEV_NOT_L3, + SFE_DRV_EXCEPTION_DEST_DEV_NOT_L3, + SFE_DRV_EXCEPTION_CREATE_FAILED, + SFE_DRV_EXCEPTION_ENQUEUE_FAILED, + SFE_DRV_EXCEPTION_NOT_SUPPORT_6RD, + SFE_DRV_EXCEPTION_NO_SYNC_CB, + SFE_DRV_EXCEPTION_MAX +} sfe_drv_exception_t; + +static char *sfe_drv_exception_events_string[SFE_DRV_EXCEPTION_MAX] = { + "IPV4_MSG_UNKNOW", + "IPV6_MSG_UNKNOW", + "CONNECTION_INVALID", + "NOT_SUPPORT_BRIDGE", + "TCP_INVALID", + "PROTOCOL_NOT_SUPPORT", + "SRC_DEV_NOT_L3", + "DEST_DEV_NOT_L3", + "CREATE_FAILED", + "ENQUEUE_FAILED", + "NOT_SUPPORT_6RD", + "NO_SYNC_CB" +}; + +#define SFE_MESSAGE_VERSION 0x1 +#define SFE_MAX_CONNECTION_NUM 65535 +#define sfe_drv_ipv6_addr_copy(src, dest) memcpy((void *)(dest), (void *)(src), 16) +#define sfe_drv_ipv4_stopped(CTX) (rcu_dereference((CTX)->ipv4_stats_sync_cb) == NULL) +#define sfe_drv_ipv6_stopped(CTX) (rcu_dereference((CTX)->ipv6_stats_sync_cb) == NULL) + +/* + * message type of queued response message + */ +typedef enum { + SFE_DRV_MSG_TYPE_IPV4, + SFE_DRV_MSG_TYPE_IPV6 +} sfe_drv_msg_types_t; + +/* + * queued response message, + * will be sent back to caller in workqueue + */ +struct sfe_drv_response_msg { + struct list_head node; + sfe_drv_msg_types_t type; + void *msg[0]; +}; + +#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 10, 0) +#define list_first_entry_or_null(ptr, type, member) \ + (!list_empty(ptr) ? list_first_entry(ptr, type, member) : NULL) +#endif + +/* + * sfe driver context instance, private for sfe driver + */ +struct sfe_drv_ctx_instance_internal { + struct sfe_drv_ctx_instance base;/* exported sfe driver context, is public to user of sfe driver*/ + + /* + * Control state. + */ + struct kobject *sys_sfe_drv; /* sysfs linkage */ + + struct list_head msg_queue; /* response message queue*/ + spinlock_t lock; /* Lock to protect message queue */ + + struct work_struct work; /* work to send response message back to caller*/ + + sfe_ipv4_msg_callback_t __rcu ipv4_stats_sync_cb; /* callback to call to sync ipv4 statistics */ + void *ipv4_stats_sync_data; /* argument for above callback: ipv4_stats_sync_cb */ + + sfe_ipv6_msg_callback_t __rcu ipv6_stats_sync_cb; /* callback to call to sync ipv6 statistics */ + void *ipv6_stats_sync_data; /* argument for above callback: ipv6_stats_sync_cb */ + + u32 exceptions[SFE_DRV_EXCEPTION_MAX]; /* statistics for exception */ +}; + +static struct sfe_drv_ctx_instance_internal __sfe_drv_ctx; + +/* + * convert public sfe driver context to internal context + */ +#define SFE_DRV_CTX_TO_PRIVATE(base) (struct sfe_drv_ctx_instance_internal *)(base) +/* + * convert internal sfe driver context to public context + */ +#define SFE_DRV_CTX_TO_PUBLIC(intrv) (struct sfe_drv_ctx_instance *)(intrv) + +/* + * sfe_drv_incr_exceptions() + * increase an exception counter. 
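
The exception enum and its name table are parallel lists that must be edited together. One way to make that pairing explicit is designated array initializers, shown here purely as an illustration (shortened names, not a change to the driver):

typedef enum {
	EXC_MSG_UNKNOWN,
	EXC_CONNECTION_INVALID,
	EXC_MAX
} exception_t;

static const char *exception_names[EXC_MAX] = {
	[EXC_MSG_UNKNOWN]        = "MSG_UNKNOWN",
	[EXC_CONNECTION_INVALID] = "CONNECTION_INVALID",
};
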
+ */ +static inline void sfe_drv_incr_exceptions(sfe_drv_exception_t except) +{ + struct sfe_drv_ctx_instance_internal *sfe_drv_ctx = &__sfe_drv_ctx; + + spin_lock_bh(&sfe_drv_ctx->lock); + sfe_drv_ctx->exceptions[except]++; + spin_unlock_bh(&sfe_drv_ctx->lock); +} + +/* + * sfe_drv_dev_is_layer_3_interface() + * check if a network device is ipv4 or ipv6 layer 3 interface + * + * @param dev network device to check + * @param check_v4 check ipv4 layer 3 interface(which have ipv4 address) or ipv6 layer 3 interface(which have ipv6 address) + */ +inline bool sfe_drv_dev_is_layer_3_interface(struct net_device *dev, bool check_v4) +{ + struct in_device *in4_dev; + struct inet6_dev *in6_dev; + + BUG_ON(!dev); + + if (likely(check_v4)) { + /* + * Does our input device support IPv4 processing? + */ + in4_dev = (struct in_device *)dev->ip_ptr; + if (unlikely(!in4_dev)) { + return false; + } + + /* + * Does it have an IPv4 address? If it doesn't then we can't do anything + * interesting here! + */ + if (unlikely(!in4_dev->ifa_list)) { + return false; + } + + return true; + } + + /* + * Does our input device support IPv6 processing? + */ + in6_dev = (struct inet6_dev *)dev->ip6_ptr; + if (unlikely(!in6_dev)) { + return false; + } + + /* + * Does it have an IPv6 address? If it doesn't then we can't do anything + * interesting here! + */ + if (unlikely(list_empty(&in6_dev->addr_list))) { + return false; + } + + return true; +} + +/* + * sfe_drv_clean_response_msg_by_type() + * clean response message in queue when ECM exit + * + * @param sfe_drv_ctx sfe driver context + * @param msg_type message type, ipv4 or ipv6 + */ +static void sfe_drv_clean_response_msg_by_type(struct sfe_drv_ctx_instance_internal *sfe_drv_ctx, sfe_drv_msg_types_t msg_type) +{ + struct sfe_drv_response_msg *response, *tmp; + + if (!sfe_drv_ctx) { + return; + } + + spin_lock_bh(&sfe_drv_ctx->lock); + list_for_each_entry_safe(response, tmp, &sfe_drv_ctx->msg_queue, node) { + if (response->type == msg_type) { + list_del(&response->node); + /* + * free response message + */ + kfree(response); + } + } + spin_unlock_bh(&sfe_drv_ctx->lock); + +} + +/* + * sfe_drv_process_response_msg() + * send all pending response message to ECM by calling callback function included in message + * + * @param work work structure + */ +static void sfe_drv_process_response_msg(struct work_struct *work) +{ + struct sfe_drv_ctx_instance_internal *sfe_drv_ctx = container_of(work, struct sfe_drv_ctx_instance_internal, work); + struct sfe_drv_response_msg *response; + + spin_lock_bh(&sfe_drv_ctx->lock); + while ((response = list_first_entry_or_null(&sfe_drv_ctx->msg_queue, struct sfe_drv_response_msg, node))) { + list_del(&response->node); + spin_unlock_bh(&sfe_drv_ctx->lock); + rcu_read_lock(); + + /* + * send response message back to caller + */ + if ((response->type == SFE_DRV_MSG_TYPE_IPV4) && !sfe_drv_ipv4_stopped(sfe_drv_ctx)) { + struct sfe_ipv4_msg *msg = (struct sfe_ipv4_msg *)response->msg; + sfe_ipv4_msg_callback_t callback = (sfe_ipv4_msg_callback_t)msg->cm.cb; + if (callback) { + callback((void *)msg->cm.app_data, msg); + } + } else if ((response->type == SFE_DRV_MSG_TYPE_IPV6) && !sfe_drv_ipv6_stopped(sfe_drv_ctx)) { + struct sfe_ipv6_msg *msg = (struct sfe_ipv6_msg *)response->msg; + sfe_ipv6_msg_callback_t callback = (sfe_ipv6_msg_callback_t)msg->cm.cb; + if (callback) { + callback((void *)msg->cm.app_data, msg); + } + } + + rcu_read_unlock(); + /* + * free response message + */ + kfree(response); + spin_lock_bh(&sfe_drv_ctx->lock); + } + 
spin_unlock_bh(&sfe_drv_ctx->lock); +} + +/* + * sfe_drv_alloc_response_msg() + * alloc and construct new response message + * + * @param type message type + * @param msg used to construct response message if not NULL + * + * @return !NULL, success; NULL, failed + */ +static struct sfe_drv_response_msg * +sfe_drv_alloc_response_msg(sfe_drv_msg_types_t type, void *msg) +{ + struct sfe_drv_response_msg *response; + int size; + + switch (type) { + case SFE_DRV_MSG_TYPE_IPV4: + size = sizeof(struct sfe_ipv4_msg); + break; + case SFE_DRV_MSG_TYPE_IPV6: + size = sizeof(struct sfe_ipv6_msg); + break; + default: + DEBUG_ERROR("message type %d not supported\n", type); + return NULL; + } + + response = (struct sfe_drv_response_msg *)kzalloc(sizeof(struct sfe_drv_response_msg) + size, GFP_ATOMIC); + if (!response) { + DEBUG_ERROR("allocate memory failed\n"); + return NULL; + } + + response->type = type; + + if (msg) { + memcpy(response->msg, msg, size); + } + + return response; +} + +/* + * sfe_drv_enqueue_msg() + * queue response message + * + * @param sfe_drv_ctx sfe driver context + * @param response response message to be queue + */ +static inline void sfe_drv_enqueue_msg(struct sfe_drv_ctx_instance_internal *sfe_drv_ctx, struct sfe_drv_response_msg *response) +{ + spin_lock_bh(&sfe_drv_ctx->lock); + list_add_tail(&response->node, &sfe_drv_ctx->msg_queue); + spin_unlock_bh(&sfe_drv_ctx->lock); + + schedule_work(&sfe_drv_ctx->work); +} + +/* + * sfe_cmn_msg_init() + * Initialize the common message structure. + * + * @param ncm message to init + * @param if_num interface number related with this message + * @param type message type + * @param cb callback function to process repsonse of this message + * @param app_data argument for above callback function + */ +static void sfe_cmn_msg_init(struct sfe_cmn_msg *ncm, u16 if_num, u32 type, u32 len, void *cb, void *app_data) +{ + ncm->interface = if_num; + ncm->version = SFE_MESSAGE_VERSION; + ncm->type = type; + ncm->len = len; + ncm->cb = (sfe_ptr_t)cb; + ncm->app_data = (sfe_ptr_t)app_data; +} + +/* + * sfe_drv_ipv4_stats_sync_callback() + * Synchronize a connection's state. 
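
The response path queues each reply on a list under the lock and lets a work item deliver it later; the worker always drops the lock around the caller's callback so the callback is free to re-enter the driver. A userspace analogue of that drain loop, with a pthread mutex standing in for the spinlock and without the RCU guard:

#include <pthread.h>
#include <stdlib.h>

struct response {
	struct response *next;
	void (*callback)(void *data);
	void *data;
};

static pthread_mutex_t queue_lock = PTHREAD_MUTEX_INITIALIZER;
static struct response *queue_head;

/* Drain the queue, releasing the lock while each callback runs. */
void process_responses(void)
{
	pthread_mutex_lock(&queue_lock);
	while (queue_head) {
		struct response *r = queue_head;

		queue_head = r->next;
		pthread_mutex_unlock(&queue_lock);

		if (r->callback)
			r->callback(r->data);
		free(r);

		pthread_mutex_lock(&queue_lock);
	}
	pthread_mutex_unlock(&queue_lock);
}
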
+ * + * @param sis SFE statistics from SFE core engine + */ +static void sfe_drv_ipv4_stats_sync_callback(struct sfe_connection_sync *sis) +{ + struct sfe_drv_ctx_instance_internal *sfe_drv_ctx = &__sfe_drv_ctx; + struct sfe_ipv4_msg msg; + struct sfe_ipv4_conn_sync *sync_msg; + sfe_ipv4_msg_callback_t sync_cb; + + rcu_read_lock(); + sync_cb = rcu_dereference(sfe_drv_ctx->ipv4_stats_sync_cb); + if (!sync_cb) { + rcu_read_unlock(); + sfe_drv_incr_exceptions(SFE_DRV_EXCEPTION_NO_SYNC_CB); + return; + } + + sync_msg = &msg.msg.conn_stats; + + memset(&msg, 0, sizeof(msg)); + sfe_cmn_msg_init(&msg.cm, 0, SFE_RX_CONN_STATS_SYNC_MSG, + sizeof(struct sfe_ipv4_conn_sync), NULL, NULL); + + /* + * fill connection specific information + */ + sync_msg->protocol = (u8)sis->protocol; + sync_msg->flow_ip = sis->src_ip.ip; + sync_msg->flow_ip_xlate = sis->src_ip_xlate.ip; + sync_msg->flow_ident = sis->src_port; + sync_msg->flow_ident_xlate = sis->src_port_xlate; + + sync_msg->return_ip = sis->dest_ip.ip; + sync_msg->return_ip_xlate = sis->dest_ip_xlate.ip; + sync_msg->return_ident = sis->dest_port; + sync_msg->return_ident_xlate = sis->dest_port_xlate; + + /* + * fill TCP protocol specific information + */ + if (sis->protocol == IPPROTO_TCP) { + sync_msg->flow_max_window = sis->src_td_max_window; + sync_msg->flow_end = sis->src_td_end; + sync_msg->flow_max_end = sis->src_td_max_end; + + sync_msg->return_max_window = sis->dest_td_max_window; + sync_msg->return_end = sis->dest_td_end; + sync_msg->return_max_end = sis->dest_td_max_end; + } + + /* + * fill statistics information + */ + sync_msg->flow_rx_packet_count = sis->src_new_packet_count; + sync_msg->flow_rx_byte_count = sis->src_new_byte_count; + sync_msg->flow_tx_packet_count = sis->dest_new_packet_count; + sync_msg->flow_tx_byte_count = sis->dest_new_byte_count; + + sync_msg->return_rx_packet_count = sis->dest_new_packet_count; + sync_msg->return_rx_byte_count = sis->dest_new_byte_count; + sync_msg->return_tx_packet_count = sis->src_new_packet_count; + sync_msg->return_tx_byte_count = sis->src_new_byte_count; + + /* + * fill expiration time to extend, in unit of msec + */ + sync_msg->inc_ticks = (((u32)sis->delta_jiffies) * MSEC_PER_SEC)/HZ; + + /* + * fill other information + */ + switch (sis->reason) { + case SFE_SYNC_REASON_DESTROY: + sync_msg->reason = SFE_RULE_SYNC_REASON_DESTROY; + break; + case SFE_SYNC_REASON_FLUSH: + sync_msg->reason = SFE_RULE_SYNC_REASON_FLUSH; + break; + default: + sync_msg->reason = SFE_RULE_SYNC_REASON_STATS; + break; + } + + /* + * SFE sync calling is excuted in a timer, so we can redirect it to ECM directly. 
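
The sync message mirrors one pair of counters into four fields, since packets received on the flow side are by definition packets transmitted on the return side, and the elapsed jiffies are converted to milliseconds for the receiver. A small sketch of that mapping with illustrative names:

#include <stdint.h>

#define MSEC_PER_SEC 1000u
#define HZ 100u		/* illustrative tick rate */

struct dir_counters {
	uint64_t packets;
	uint64_t bytes;
};

struct sync_out {
	uint64_t flow_rx_packets, flow_tx_packets;
	uint64_t return_rx_packets, return_tx_packets;
	uint32_t inc_ticks_ms;
};

void fill_sync(struct sync_out *out,
	       const struct dir_counters *src_new,
	       const struct dir_counters *dest_new,
	       uint32_t delta_jiffies)
{
	out->flow_rx_packets = src_new->packets;	/* seen from the flow side */
	out->flow_tx_packets = dest_new->packets;

	out->return_rx_packets = dest_new->packets;	/* same counters, mirrored */
	out->return_tx_packets = src_new->packets;

	out->inc_ticks_ms = (delta_jiffies * MSEC_PER_SEC) / HZ;
}
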
+ */ + sync_cb(sfe_drv_ctx->ipv4_stats_sync_data, &msg); + rcu_read_unlock(); +} + +/* + * sfe_drv_create_ipv4_rule_msg() + * convert create message format from ecm to sfe + * + * @param sfe_drv_ctx sfe driver context + * @param msg The IPv4 message + * + * @return sfe_tx_status_t The status of the Tx operation + */ +sfe_tx_status_t sfe_drv_create_ipv4_rule_msg(struct sfe_drv_ctx_instance_internal *sfe_drv_ctx, struct sfe_ipv4_msg *msg) +{ + struct sfe_connection_create sic; + struct net_device *src_dev = NULL; + struct net_device *dest_dev = NULL; + struct sfe_drv_response_msg *response; + enum sfe_cmn_response ret; + + response = sfe_drv_alloc_response_msg(SFE_DRV_MSG_TYPE_IPV4, msg); + if (!response) { + sfe_drv_incr_exceptions(SFE_DRV_EXCEPTION_ENQUEUE_FAILED); + return SFE_TX_FAILURE_QUEUE; + } + + if (!(msg->msg.rule_create.valid_flags & SFE_RULE_CREATE_CONN_VALID)) { + ret = SFE_CMN_RESPONSE_EMSG; + sfe_drv_incr_exceptions(SFE_DRV_EXCEPTION_CONNECTION_INVALID); + goto failed_ret; + } + + /* + * not support bridged flows now + */ + if (msg->msg.rule_create.rule_flags & SFE_RULE_CREATE_FLAG_BRIDGE_FLOW) { + ret = SFE_CMN_RESPONSE_EINTERFACE; + sfe_drv_incr_exceptions(SFE_DRV_EXCEPTION_NOT_SUPPORT_BRIDGE); + goto failed_ret; + } + + sic.protocol = msg->msg.rule_create.tuple.protocol; + sic.src_ip.ip = msg->msg.rule_create.tuple.flow_ip; + sic.dest_ip.ip = msg->msg.rule_create.tuple.return_ip; + sic.src_ip_xlate.ip = msg->msg.rule_create.conn_rule.flow_ip_xlate; + sic.dest_ip_xlate.ip = msg->msg.rule_create.conn_rule.return_ip_xlate; + + sic.flags = 0; + switch (sic.protocol) { + case IPPROTO_TCP: + if (!(msg->msg.rule_create.valid_flags & SFE_RULE_CREATE_TCP_VALID)) { + ret = SFE_CMN_RESPONSE_EMSG; + sfe_drv_incr_exceptions(SFE_DRV_EXCEPTION_TCP_INVALID); + goto failed_ret; + } + + sic.src_port = msg->msg.rule_create.tuple.flow_ident; + sic.dest_port = msg->msg.rule_create.tuple.return_ident; + sic.src_port_xlate = msg->msg.rule_create.conn_rule.flow_ident_xlate; + sic.dest_port_xlate = msg->msg.rule_create.conn_rule.return_ident_xlate; + sic.src_td_window_scale = msg->msg.rule_create.tcp_rule.flow_window_scale; + sic.src_td_max_window = msg->msg.rule_create.tcp_rule.flow_max_window; + sic.src_td_end = msg->msg.rule_create.tcp_rule.flow_end; + sic.src_td_max_end = msg->msg.rule_create.tcp_rule.flow_max_end; + sic.dest_td_window_scale = msg->msg.rule_create.tcp_rule.return_window_scale; + sic.dest_td_max_window = msg->msg.rule_create.tcp_rule.return_max_window; + sic.dest_td_end = msg->msg.rule_create.tcp_rule.return_end; + sic.dest_td_max_end = msg->msg.rule_create.tcp_rule.return_max_end; + if (msg->msg.rule_create.rule_flags & SFE_RULE_CREATE_FLAG_NO_SEQ_CHECK) { + sic.flags |= SFE_CREATE_FLAG_NO_SEQ_CHECK; + } + break; + + case IPPROTO_UDP: + sic.src_port = msg->msg.rule_create.tuple.flow_ident; + sic.dest_port = msg->msg.rule_create.tuple.return_ident; + sic.src_port_xlate = msg->msg.rule_create.conn_rule.flow_ident_xlate; + sic.dest_port_xlate = msg->msg.rule_create.conn_rule.return_ident_xlate; + break; + + default: + ret = SFE_CMN_RESPONSE_EMSG; + sfe_drv_incr_exceptions(SFE_DRV_EXCEPTION_PROTOCOL_NOT_SUPPORT); + goto failed_ret; + } + + memcpy(sic.src_mac, msg->msg.rule_create.conn_rule.flow_mac, ETH_ALEN); + memset(sic.src_mac_xlate, 0, ETH_ALEN); + memset(sic.dest_mac, 0, ETH_ALEN); + memcpy(sic.dest_mac_xlate, msg->msg.rule_create.conn_rule.return_mac, ETH_ALEN); + + /* + * Does our input device support IP processing? 
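+ * Note: dev_get_by_index() takes a reference on the device; it is
+ * released via dev_put() before this function returns (the success path
+ * falls through to the failed_ret label as well).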
+ */ + src_dev = dev_get_by_index(&init_net, msg->msg.rule_create.conn_rule.flow_top_interface_num); + if (!src_dev || !sfe_drv_dev_is_layer_3_interface(src_dev, true)) { + ret = SFE_CMN_RESPONSE_EINTERFACE; + sfe_drv_incr_exceptions(SFE_DRV_EXCEPTION_SRC_DEV_NOT_L3); + goto failed_ret; + } + + /* + * Does our output device support IP processing? + */ + dest_dev = dev_get_by_index(&init_net, msg->msg.rule_create.conn_rule.return_top_interface_num); + if (!dest_dev || !sfe_drv_dev_is_layer_3_interface(dest_dev, true)) { + ret = SFE_CMN_RESPONSE_EINTERFACE; + sfe_drv_incr_exceptions(SFE_DRV_EXCEPTION_DEST_DEV_NOT_L3); + goto failed_ret; + } + + sic.src_dev = src_dev; + sic.dest_dev = dest_dev; + + sic.src_mtu = msg->msg.rule_create.conn_rule.flow_mtu; + sic.dest_mtu = msg->msg.rule_create.conn_rule.return_mtu; + + if (msg->msg.rule_create.valid_flags & SFE_RULE_CREATE_QOS_VALID) { + sic.src_priority = msg->msg.rule_create.qos_rule.flow_qos_tag; + sic.dest_priority = msg->msg.rule_create.qos_rule.return_qos_tag; + sic.flags |= SFE_CREATE_FLAG_REMARK_PRIORITY; + } + + if (msg->msg.rule_create.valid_flags & SFE_RULE_CREATE_DSCP_MARKING_VALID) { + sic.src_dscp = msg->msg.rule_create.dscp_rule.flow_dscp; + sic.dest_dscp = msg->msg.rule_create.dscp_rule.return_dscp; + sic.flags |= SFE_CREATE_FLAG_REMARK_DSCP; + } + +#ifdef CONFIG_XFRM + if (msg->msg.rule_create.valid_flags & SFE_RULE_CREATE_DIRECTION_VALID) { + sic.original_accel = msg->msg.rule_create.direction_rule.flow_accel; + sic.reply_accel = msg->msg.rule_create.direction_rule.return_accel; + } else { + sic.original_accel = sic.reply_accel = 1; + } +#endif + + if (!sfe_ipv4_create_rule(&sic)) { + /* success */ + ret = SFE_CMN_RESPONSE_ACK; + } else { + /* failed */ + ret = SFE_CMN_RESPONSE_EMSG; + sfe_drv_incr_exceptions(SFE_DRV_EXCEPTION_CREATE_FAILED); + } + + /* + * fall through + */ +failed_ret: + if (src_dev) { + dev_put(src_dev); + } + + if (dest_dev) { + dev_put(dest_dev); + } + + /* + * try to queue response message + */ + ((struct sfe_ipv4_msg *)response->msg)->cm.response = msg->cm.response = ret; + sfe_drv_enqueue_msg(sfe_drv_ctx, response); + + return SFE_TX_SUCCESS; +} + +/* + * sfe_drv_destroy_ipv4_rule_msg() + * convert destroy message format from ecm to sfe + * + * @param sfe_drv_ctx sfe driver context + * @param msg The IPv4 message + * + * @return sfe_tx_status_t The status of the Tx operation + */ +sfe_tx_status_t sfe_drv_destroy_ipv4_rule_msg(struct sfe_drv_ctx_instance_internal *sfe_drv_ctx, struct sfe_ipv4_msg *msg) +{ + struct sfe_connection_destroy sid; + struct sfe_drv_response_msg *response; + + response = sfe_drv_alloc_response_msg(SFE_DRV_MSG_TYPE_IPV4, msg); + if (!response) { + sfe_drv_incr_exceptions(SFE_DRV_EXCEPTION_ENQUEUE_FAILED); + return SFE_TX_FAILURE_QUEUE; + } + + sid.protocol = msg->msg.rule_destroy.tuple.protocol; + sid.src_ip.ip = msg->msg.rule_destroy.tuple.flow_ip; + sid.dest_ip.ip = msg->msg.rule_destroy.tuple.return_ip; + sid.src_port = msg->msg.rule_destroy.tuple.flow_ident; + sid.dest_port = msg->msg.rule_destroy.tuple.return_ident; + + sfe_ipv4_destroy_rule(&sid); + + /* + * try to queue response message + */ + ((struct sfe_ipv4_msg *)response->msg)->cm.response = msg->cm.response = SFE_CMN_RESPONSE_ACK; + sfe_drv_enqueue_msg(sfe_drv_ctx, response); + + return SFE_TX_SUCCESS; +} + +/* + * sfe_drv_ipv4_tx() + * Transmit an IPv4 message to the sfe + * + * @param sfe_drv_ctx sfe driver context + * @param msg The IPv4 message + * + * @return sfe_tx_status_t The status of the Tx operation + */ 
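+/*
+ * Example of how a connection manager might drive this API (hypothetical
+ * caller code for illustration only; my_ctx is the pointer returned by
+ * sfe_drv_ipv4_notify_register(), and my_resp_cb/my_app_data are the
+ * caller's own response callback and context, not part of this driver):
+ *
+ *	struct sfe_ipv4_msg nim;
+ *
+ *	sfe_ipv4_msg_init(&nim, 0, SFE_TX_CREATE_RULE_MSG,
+ *			  sizeof(struct sfe_ipv4_rule_create_msg),
+ *			  my_resp_cb, my_app_data);
+ *	(fill nim.msg.rule_create: tuple, conn_rule and valid_flags at least)
+ *	sfe_drv_ipv4_tx(my_ctx, &nim);
+ */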
+sfe_tx_status_t sfe_drv_ipv4_tx(struct sfe_drv_ctx_instance *sfe_drv_ctx, struct sfe_ipv4_msg *msg) +{ + switch (msg->cm.type) { + case SFE_TX_CREATE_RULE_MSG: + return sfe_drv_create_ipv4_rule_msg(SFE_DRV_CTX_TO_PRIVATE(sfe_drv_ctx), msg); + case SFE_TX_DESTROY_RULE_MSG: + return sfe_drv_destroy_ipv4_rule_msg(SFE_DRV_CTX_TO_PRIVATE(sfe_drv_ctx), msg); + default: + sfe_drv_incr_exceptions(SFE_DRV_EXCEPTION_IPV4_MSG_UNKNOW); + return SFE_TX_FAILURE_NOT_ENABLED; + } +} +EXPORT_SYMBOL(sfe_drv_ipv4_tx); + +/* + * sfe_ipv4_msg_init() + * Initialize IPv4 message. + */ +void sfe_ipv4_msg_init(struct sfe_ipv4_msg *nim, u16 if_num, u32 type, u32 len, + sfe_ipv4_msg_callback_t cb, void *app_data) +{ + sfe_cmn_msg_init(&nim->cm, if_num, type, len, (void *)cb, app_data); +} +EXPORT_SYMBOL(sfe_ipv4_msg_init); + +/* + * sfe_drv_ipv4_max_conn_count() + * return maximum number of entries SFE supported + */ +int sfe_drv_ipv4_max_conn_count(void) +{ + return SFE_MAX_CONNECTION_NUM; +} +EXPORT_SYMBOL(sfe_drv_ipv4_max_conn_count); + +/* + * sfe_drv_ipv4_notify_register() + * Register a notifier callback for IPv4 messages from sfe driver + * + * @param cb The callback pointer + * @param app_data The application context for this message + * + * @return struct sfe_drv_ctx_instance * The sfe driver context + */ +struct sfe_drv_ctx_instance *sfe_drv_ipv4_notify_register(sfe_ipv4_msg_callback_t cb, void *app_data) +{ + struct sfe_drv_ctx_instance_internal *sfe_drv_ctx = &__sfe_drv_ctx; + + spin_lock_bh(&sfe_drv_ctx->lock); + /* + * Hook the shortcut sync callback. + */ + if (cb && !sfe_drv_ctx->ipv4_stats_sync_cb) { + sfe_ipv4_register_sync_rule_callback(sfe_drv_ipv4_stats_sync_callback); + } + + rcu_assign_pointer(sfe_drv_ctx->ipv4_stats_sync_cb, cb); + sfe_drv_ctx->ipv4_stats_sync_data = app_data; + + spin_unlock_bh(&sfe_drv_ctx->lock); + + return SFE_DRV_CTX_TO_PUBLIC(sfe_drv_ctx); +} +EXPORT_SYMBOL(sfe_drv_ipv4_notify_register); + +/* + * sfe_drv_ipv4_notify_unregister() + * Un-Register a notifier callback for IPv4 messages from sfe driver + */ +void sfe_drv_ipv4_notify_unregister(void) +{ + struct sfe_drv_ctx_instance_internal *sfe_drv_ctx = &__sfe_drv_ctx; + + spin_lock_bh(&sfe_drv_ctx->lock); + /* + * Unregister our sync callback. + */ + if (sfe_drv_ctx->ipv4_stats_sync_cb) { + sfe_ipv4_register_sync_rule_callback(NULL); + rcu_assign_pointer(sfe_drv_ctx->ipv4_stats_sync_cb, NULL); + sfe_drv_ctx->ipv4_stats_sync_data = NULL; + } + spin_unlock_bh(&sfe_drv_ctx->lock); + + sfe_drv_clean_response_msg_by_type(sfe_drv_ctx, SFE_DRV_MSG_TYPE_IPV4); + + return; +} +EXPORT_SYMBOL(sfe_drv_ipv4_notify_unregister); + +/* + * sfe_drv_ipv6_stats_sync_callback() + * Synchronize a connection's state. 
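+ *
+ * IPv6 counterpart of sfe_drv_ipv4_stats_sync_callback(): the sync record
+ * is converted into an sfe_ipv6_msg and forwarded to the callback
+ * registered through sfe_drv_ipv6_notify_register().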
+ */ +static void sfe_drv_ipv6_stats_sync_callback(struct sfe_connection_sync *sis) +{ + struct sfe_drv_ctx_instance_internal *sfe_drv_ctx = &__sfe_drv_ctx; + struct sfe_ipv6_msg msg; + struct sfe_ipv6_conn_sync *sync_msg; + sfe_ipv6_msg_callback_t sync_cb; + + rcu_read_lock(); + sync_cb = rcu_dereference(sfe_drv_ctx->ipv6_stats_sync_cb); + if (!sync_cb) { + rcu_read_unlock(); + sfe_drv_incr_exceptions(SFE_DRV_EXCEPTION_NO_SYNC_CB); + return; + } + + sync_msg = &msg.msg.conn_stats; + + memset(&msg, 0, sizeof(msg)); + sfe_cmn_msg_init(&msg.cm, 0, SFE_RX_CONN_STATS_SYNC_MSG, + sizeof(struct sfe_ipv6_conn_sync), NULL, NULL); + + /* + * fill connection specific information + */ + sync_msg->protocol = (u8)sis->protocol; + sfe_drv_ipv6_addr_copy(sis->src_ip.ip6, sync_msg->flow_ip); + sync_msg->flow_ident = sis->src_port; + + sfe_drv_ipv6_addr_copy(sis->dest_ip.ip6, sync_msg->return_ip); + sync_msg->return_ident = sis->dest_port; + + /* + * fill TCP protocol specific information + */ + if (sis->protocol == IPPROTO_TCP) { + sync_msg->flow_max_window = sis->src_td_max_window; + sync_msg->flow_end = sis->src_td_end; + sync_msg->flow_max_end = sis->src_td_max_end; + + sync_msg->return_max_window = sis->dest_td_max_window; + sync_msg->return_end = sis->dest_td_end; + sync_msg->return_max_end = sis->dest_td_max_end; + } + + /* + * fill statistics information + */ + sync_msg->flow_rx_packet_count = sis->src_new_packet_count; + sync_msg->flow_rx_byte_count = sis->src_new_byte_count; + sync_msg->flow_tx_packet_count = sis->dest_new_packet_count; + sync_msg->flow_tx_byte_count = sis->dest_new_byte_count; + + sync_msg->return_rx_packet_count = sis->dest_new_packet_count; + sync_msg->return_rx_byte_count = sis->dest_new_byte_count; + sync_msg->return_tx_packet_count = sis->src_new_packet_count; + sync_msg->return_tx_byte_count = sis->src_new_byte_count; + + /* + * fill expiration time to extend, in unit of msec + */ + sync_msg->inc_ticks = (((u32)sis->delta_jiffies) * MSEC_PER_SEC)/HZ; + + /* + * fill other information + */ + switch (sis->reason) { + case SFE_SYNC_REASON_DESTROY: + sync_msg->reason = SFE_RULE_SYNC_REASON_DESTROY; + break; + case SFE_SYNC_REASON_FLUSH: + sync_msg->reason = SFE_RULE_SYNC_REASON_FLUSH; + break; + default: + sync_msg->reason = SFE_RULE_SYNC_REASON_STATS; + break; + } + + /* + * SFE sync calling is excuted in a timer, so we can redirect it to ECM directly. 
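+ * The call is made while still holding rcu_read_lock(), which keeps the
+ * callback pointer obtained via rcu_dereference() above valid.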
+ */ + sync_cb(sfe_drv_ctx->ipv6_stats_sync_data, &msg); + rcu_read_unlock(); +} + +/* + * sfe_drv_create_ipv6_rule_msg() + * convert create message format from ecm to sfe + * + * @param sfe_drv_ctx sfe driver context + * @param msg The IPv6 message + * + * @return sfe_tx_status_t The status of the Tx operation + */ +sfe_tx_status_t sfe_drv_create_ipv6_rule_msg(struct sfe_drv_ctx_instance_internal *sfe_drv_ctx, struct sfe_ipv6_msg *msg) +{ + struct sfe_connection_create sic; + struct net_device *src_dev = NULL; + struct net_device *dest_dev = NULL; + struct sfe_drv_response_msg *response; + enum sfe_cmn_response ret; + + response = sfe_drv_alloc_response_msg(SFE_DRV_MSG_TYPE_IPV6, msg); + if (!response) { + sfe_drv_incr_exceptions(SFE_DRV_EXCEPTION_ENQUEUE_FAILED); + return SFE_TX_FAILURE_QUEUE; + } + + if (!(msg->msg.rule_create.valid_flags & SFE_RULE_CREATE_CONN_VALID)) { + ret = SFE_CMN_RESPONSE_EMSG; + sfe_drv_incr_exceptions(SFE_DRV_EXCEPTION_CONNECTION_INVALID); + goto failed_ret; + } + + /* + * not support bridged flows now + */ + if (msg->msg.rule_create.rule_flags & SFE_RULE_CREATE_FLAG_BRIDGE_FLOW) { + ret = SFE_CMN_RESPONSE_EINTERFACE; + sfe_drv_incr_exceptions(SFE_DRV_EXCEPTION_NOT_SUPPORT_BRIDGE); + goto failed_ret; + } + + sic.protocol = msg->msg.rule_create.tuple.protocol; + sfe_drv_ipv6_addr_copy(msg->msg.rule_create.tuple.flow_ip, sic.src_ip.ip6); + sfe_drv_ipv6_addr_copy(msg->msg.rule_create.tuple.return_ip, sic.dest_ip.ip6); + sfe_drv_ipv6_addr_copy(msg->msg.rule_create.tuple.flow_ip, sic.src_ip_xlate.ip6); + sfe_drv_ipv6_addr_copy(msg->msg.rule_create.tuple.return_ip, sic.dest_ip_xlate.ip6); + + sic.flags = 0; + switch (sic.protocol) { + case IPPROTO_TCP: + if (!(msg->msg.rule_create.valid_flags & SFE_RULE_CREATE_TCP_VALID)) { + ret = SFE_CMN_RESPONSE_EMSG; + sfe_drv_incr_exceptions(SFE_DRV_EXCEPTION_TCP_INVALID); + goto failed_ret; + } + + sic.src_port = msg->msg.rule_create.tuple.flow_ident; + sic.dest_port = msg->msg.rule_create.tuple.return_ident; + sic.src_port_xlate = msg->msg.rule_create.tuple.flow_ident; + sic.dest_port_xlate = msg->msg.rule_create.tuple.return_ident; + sic.src_td_window_scale = msg->msg.rule_create.tcp_rule.flow_window_scale; + sic.src_td_max_window = msg->msg.rule_create.tcp_rule.flow_max_window; + sic.src_td_end = msg->msg.rule_create.tcp_rule.flow_end; + sic.src_td_max_end = msg->msg.rule_create.tcp_rule.flow_max_end; + sic.dest_td_window_scale = msg->msg.rule_create.tcp_rule.return_window_scale; + sic.dest_td_max_window = msg->msg.rule_create.tcp_rule.return_max_window; + sic.dest_td_end = msg->msg.rule_create.tcp_rule.return_end; + sic.dest_td_max_end = msg->msg.rule_create.tcp_rule.return_max_end; + if (msg->msg.rule_create.rule_flags & SFE_RULE_CREATE_FLAG_NO_SEQ_CHECK) { + sic.flags |= SFE_CREATE_FLAG_NO_SEQ_CHECK; + } + break; + + case IPPROTO_UDP: + sic.src_port = msg->msg.rule_create.tuple.flow_ident; + sic.dest_port = msg->msg.rule_create.tuple.return_ident; + sic.src_port_xlate = msg->msg.rule_create.tuple.flow_ident; + sic.dest_port_xlate = msg->msg.rule_create.tuple.return_ident; + break; + + default: + ret = SFE_CMN_RESPONSE_EMSG; + sfe_drv_incr_exceptions(SFE_DRV_EXCEPTION_PROTOCOL_NOT_SUPPORT); + goto failed_ret; + } + + memcpy(sic.src_mac, msg->msg.rule_create.conn_rule.flow_mac, ETH_ALEN); + memset(sic.src_mac_xlate, 0, ETH_ALEN); + memset(sic.dest_mac, 0, ETH_ALEN); + memcpy(sic.dest_mac_xlate, msg->msg.rule_create.conn_rule.return_mac, ETH_ALEN); + /* + * Does our input device support IP processing? 
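+ * The second argument to sfe_drv_dev_is_layer_3_interface() is false in
+ * the IPv6 path, i.e. the device is checked for an IPv6 address rather
+ * than an IPv4 one.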
+ */ + src_dev = dev_get_by_index(&init_net, msg->msg.rule_create.conn_rule.flow_top_interface_num); + if (!src_dev || !sfe_drv_dev_is_layer_3_interface(src_dev, false)) { + ret = SFE_CMN_RESPONSE_EINTERFACE; + sfe_drv_incr_exceptions(SFE_DRV_EXCEPTION_SRC_DEV_NOT_L3); + goto failed_ret; + } + + /* + * Does our output device support IP processing? + */ + dest_dev = dev_get_by_index(&init_net, msg->msg.rule_create.conn_rule.return_top_interface_num); + if (!dest_dev || !sfe_drv_dev_is_layer_3_interface(dest_dev, false)) { + ret = SFE_CMN_RESPONSE_EINTERFACE; + sfe_drv_incr_exceptions(SFE_DRV_EXCEPTION_DEST_DEV_NOT_L3); + goto failed_ret; + } + + sic.src_dev = src_dev; + sic.dest_dev = dest_dev; + + sic.src_mtu = msg->msg.rule_create.conn_rule.flow_mtu; + sic.dest_mtu = msg->msg.rule_create.conn_rule.return_mtu; + + if (msg->msg.rule_create.valid_flags & SFE_RULE_CREATE_QOS_VALID) { + sic.src_priority = msg->msg.rule_create.qos_rule.flow_qos_tag; + sic.dest_priority = msg->msg.rule_create.qos_rule.return_qos_tag; + sic.flags |= SFE_CREATE_FLAG_REMARK_PRIORITY; + } + + if (msg->msg.rule_create.valid_flags & SFE_RULE_CREATE_DSCP_MARKING_VALID) { + sic.src_dscp = msg->msg.rule_create.dscp_rule.flow_dscp; + sic.dest_dscp = msg->msg.rule_create.dscp_rule.return_dscp; + sic.flags |= SFE_CREATE_FLAG_REMARK_DSCP; + } + +#ifdef CONFIG_XFRM + if (msg->msg.rule_create.valid_flags & SFE_RULE_CREATE_DIRECTION_VALID) { + sic.original_accel = msg->msg.rule_create.direction_rule.flow_accel; + sic.reply_accel = msg->msg.rule_create.direction_rule.return_accel; + } else { + sic.original_accel = sic.reply_accel = 1; + } +#endif + + if (!sfe_ipv6_create_rule(&sic)) { + /* success */ + ret = SFE_CMN_RESPONSE_ACK; + } else { + /* failed */ + ret = SFE_CMN_RESPONSE_EMSG; + sfe_drv_incr_exceptions(SFE_DRV_EXCEPTION_CREATE_FAILED); + } + + /* + * fall through + */ +failed_ret: + if (src_dev) { + dev_put(src_dev); + } + + if (dest_dev) { + dev_put(dest_dev); + } + + /* + * try to queue response message + */ + ((struct sfe_ipv6_msg *)response->msg)->cm.response = msg->cm.response = ret; + sfe_drv_enqueue_msg(sfe_drv_ctx, response); + + return SFE_TX_SUCCESS; +} + +/* + * sfe_drv_destroy_ipv6_rule_msg() + * convert destroy message format from ecm to sfe + * + * @param sfe_drv_ctx sfe driver context + * @param msg The IPv6 message + * + * @return sfe_tx_status_t The status of the Tx operation + */ +sfe_tx_status_t sfe_drv_destroy_ipv6_rule_msg(struct sfe_drv_ctx_instance_internal *sfe_drv_ctx, struct sfe_ipv6_msg *msg) +{ + struct sfe_connection_destroy sid; + struct sfe_drv_response_msg *response; + + response = sfe_drv_alloc_response_msg(SFE_DRV_MSG_TYPE_IPV6, msg); + if (!response) { + sfe_drv_incr_exceptions(SFE_DRV_EXCEPTION_ENQUEUE_FAILED); + return SFE_TX_FAILURE_QUEUE; + } + + sid.protocol = msg->msg.rule_destroy.tuple.protocol; + sfe_drv_ipv6_addr_copy(msg->msg.rule_destroy.tuple.flow_ip, sid.src_ip.ip6); + sfe_drv_ipv6_addr_copy(msg->msg.rule_destroy.tuple.return_ip, sid.dest_ip.ip6); + sid.src_port = msg->msg.rule_destroy.tuple.flow_ident; + sid.dest_port = msg->msg.rule_destroy.tuple.return_ident; + + sfe_ipv6_destroy_rule(&sid); + + /* + * try to queue response message + */ + ((struct sfe_ipv6_msg *)response->msg)->cm.response = msg->cm.response = SFE_CMN_RESPONSE_ACK; + sfe_drv_enqueue_msg(sfe_drv_ctx, response); + + return SFE_TX_SUCCESS; +} + +/* + * sfe_drv_ipv6_tx() + * Transmit an IPv6 message to the sfe + * + * @param sfe_drv_ctx sfe driver context + * @param msg The IPv6 message + * + * @return 
sfe_tx_status_t The status of the Tx operation + */ +sfe_tx_status_t sfe_drv_ipv6_tx(struct sfe_drv_ctx_instance *sfe_drv_ctx, struct sfe_ipv6_msg *msg) +{ + switch (msg->cm.type) { + case SFE_TX_CREATE_RULE_MSG: + return sfe_drv_create_ipv6_rule_msg(SFE_DRV_CTX_TO_PRIVATE(sfe_drv_ctx), msg); + case SFE_TX_DESTROY_RULE_MSG: + return sfe_drv_destroy_ipv6_rule_msg(SFE_DRV_CTX_TO_PRIVATE(sfe_drv_ctx), msg); + default: + sfe_drv_incr_exceptions(SFE_DRV_EXCEPTION_IPV6_MSG_UNKNOW); + return SFE_TX_FAILURE_NOT_ENABLED; + } +} +EXPORT_SYMBOL(sfe_drv_ipv6_tx); + +/* + * sfe_ipv6_msg_init() + * Initialize IPv6 message. + */ +void sfe_ipv6_msg_init(struct sfe_ipv6_msg *nim, u16 if_num, u32 type, u32 len, + sfe_ipv6_msg_callback_t cb, void *app_data) +{ + sfe_cmn_msg_init(&nim->cm, if_num, type, len, (void *)cb, app_data); +} +EXPORT_SYMBOL(sfe_ipv6_msg_init); + +/* + * sfe_drv_ipv6_max_conn_count() + * return maximum number of entries SFE supported + */ +int sfe_drv_ipv6_max_conn_count(void) +{ + return SFE_MAX_CONNECTION_NUM; +} +EXPORT_SYMBOL(sfe_drv_ipv6_max_conn_count); + +/* + * sfe_drv_ipv6_notify_register() + * Register a notifier callback for IPv6 messages from sfe driver + * + * @param cb The callback pointer + * @param app_data The application context for this message + * + * @return struct sfe_drv_ctx_instance * The sfe driver context + */ +struct sfe_drv_ctx_instance *sfe_drv_ipv6_notify_register(sfe_ipv6_msg_callback_t cb, void *app_data) +{ + struct sfe_drv_ctx_instance_internal *sfe_drv_ctx = &__sfe_drv_ctx; + + spin_lock_bh(&sfe_drv_ctx->lock); + /* + * Hook the shortcut sync callback. + */ + if (cb && !sfe_drv_ctx->ipv6_stats_sync_cb) { + sfe_ipv6_register_sync_rule_callback(sfe_drv_ipv6_stats_sync_callback); + } + + rcu_assign_pointer(sfe_drv_ctx->ipv6_stats_sync_cb, cb); + sfe_drv_ctx->ipv6_stats_sync_data = app_data; + + spin_unlock_bh(&sfe_drv_ctx->lock); + + return SFE_DRV_CTX_TO_PUBLIC(sfe_drv_ctx); +} +EXPORT_SYMBOL(sfe_drv_ipv6_notify_register); + +/* + * sfe_drv_ipv6_notify_unregister() + * Un-Register a notifier callback for IPv6 messages from sfe driver + */ +void sfe_drv_ipv6_notify_unregister(void) +{ + struct sfe_drv_ctx_instance_internal *sfe_drv_ctx = &__sfe_drv_ctx; + + spin_lock_bh(&sfe_drv_ctx->lock); + /* + * Unregister our sync callback. + */ + if (sfe_drv_ctx->ipv6_stats_sync_cb) { + sfe_ipv6_register_sync_rule_callback(NULL); + rcu_assign_pointer(sfe_drv_ctx->ipv6_stats_sync_cb, NULL); + sfe_drv_ctx->ipv6_stats_sync_data = NULL; + } + spin_unlock_bh(&sfe_drv_ctx->lock); + + sfe_drv_clean_response_msg_by_type(sfe_drv_ctx, SFE_DRV_MSG_TYPE_IPV6); + + return; +} +EXPORT_SYMBOL(sfe_drv_ipv6_notify_unregister); + +/* + * sfe_tun6rd_tx() + * Transmit a tun6rd message to sfe engine + */ +sfe_tx_status_t sfe_tun6rd_tx(struct sfe_drv_ctx_instance *sfe_drv_ctx, struct sfe_tun6rd_msg *msg) +{ + sfe_drv_incr_exceptions(SFE_DRV_EXCEPTION_NOT_SUPPORT_6RD); + return SFE_TX_FAILURE_NOT_ENABLED; +} +EXPORT_SYMBOL(sfe_tun6rd_tx); + +/* + * sfe_tun6rd_msg_init() + * Initialize sfe_tun6rd msg. + */ +void sfe_tun6rd_msg_init(struct sfe_tun6rd_msg *ncm, u16 if_num, u32 type, u32 len, void *cb, void *app_data) +{ + sfe_cmn_msg_init(&ncm->cm, if_num, type, len, cb, app_data); +} +EXPORT_SYMBOL(sfe_tun6rd_msg_init); + +/* + * sfe_drv_recv() + * Handle packet receives. + * + * Returns 1 if the packet is forwarded or 0 if it isn't. 
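+ *
+ * This is the function installed in the athrs_fast_nat_recv hook by
+ * sfe_drv_init(); a return of 0 leaves the packet for the normal
+ * network stack to process.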
+ */ +int sfe_drv_recv(struct sk_buff *skb) +{ + struct net_device *dev; + + /* + * We know that for the vast majority of packets we need the transport + * layer header so we may as well start to fetch it now! + */ + prefetch(skb->data + 32); + barrier(); + + dev = skb->dev; + +/* + * TODO: Remove the check when INgress Qdisc is ported to 5.4 kernel. + */ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(5, 4, 0)) +#ifdef CONFIG_NET_CLS_ACT + /* + * If ingress Qdisc configured, and packet not processed by ingress Qdisc yet + * We can not accelerate this packet. + */ + if (dev->ingress_queue && !(skb->tc_verd & TC_NCLS)) { + return 0; + } +#endif +#endif + + /* + * We're only interested in IPv4 and IPv6 packets. + */ + if (likely(htons(ETH_P_IP) == skb->protocol)) { + if (sfe_drv_dev_is_layer_3_interface(dev, true)) { + return sfe_ipv4_recv(dev, skb); + } else { + DEBUG_TRACE("no IPv4 address for device: %s\n", dev->name); + return 0; + } + } + + if (likely(htons(ETH_P_IPV6) == skb->protocol)) { + if (sfe_drv_dev_is_layer_3_interface(dev, false)) { + return sfe_ipv6_recv(dev, skb); + } else { + DEBUG_TRACE("no IPv6 address for device: %s\n", dev->name); + return 0; + } + } + + DEBUG_TRACE("not IP packet\n"); + return 0; +} + +/* + * sfe_drv_get_exceptions() + * dump exception counters + */ +static ssize_t sfe_drv_get_exceptions(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + int idx, len; + struct sfe_drv_ctx_instance_internal *sfe_drv_ctx = &__sfe_drv_ctx; + + spin_lock_bh(&sfe_drv_ctx->lock); + for (len = 0, idx = 0; idx < SFE_DRV_EXCEPTION_MAX; idx++) { + if (sfe_drv_ctx->exceptions[idx]) { + len += snprintf(buf + len, (ssize_t)(PAGE_SIZE - len), "%s = %d\n", sfe_drv_exception_events_string[idx], sfe_drv_ctx->exceptions[idx]); + } + } + spin_unlock_bh(&sfe_drv_ctx->lock); + + return len; +} + +/* + * sysfs attributes. + */ +static const struct device_attribute sfe_drv_exceptions_attr = + __ATTR(exceptions, S_IRUGO, sfe_drv_get_exceptions, NULL); + +/* + * sfe_drv_init() + */ +static int __init sfe_drv_init(void) +{ + struct sfe_drv_ctx_instance_internal *sfe_drv_ctx = &__sfe_drv_ctx; + int result = -1; + + /* + * Create sys/sfe_drv + */ + sfe_drv_ctx->sys_sfe_drv = kobject_create_and_add("sfe_drv", NULL); + if (!sfe_drv_ctx->sys_sfe_drv) { + DEBUG_ERROR("failed to register sfe_drv\n"); + goto exit1; + } + + /* + * Create sys/sfe_drv/exceptions + */ + result = sysfs_create_file(sfe_drv_ctx->sys_sfe_drv, &sfe_drv_exceptions_attr.attr); + if (result) { + DEBUG_ERROR("failed to register exceptions file: %d\n", result); + goto exit2; + } + + spin_lock_init(&sfe_drv_ctx->lock); + + INIT_LIST_HEAD(&sfe_drv_ctx->msg_queue); + INIT_WORK(&sfe_drv_ctx->work, sfe_drv_process_response_msg); + + /* + * Hook the receive path in the network stack. + */ + BUG_ON(athrs_fast_nat_recv); + RCU_INIT_POINTER(athrs_fast_nat_recv, sfe_drv_recv); + + return 0; +exit2: + kobject_put(sfe_drv_ctx->sys_sfe_drv); +exit1: + return result; +} + +/* + * sfe_drv_exit() + */ +static void __exit sfe_drv_exit(void) +{ + struct sfe_drv_ctx_instance_internal *sfe_drv_ctx = &__sfe_drv_ctx; + + /* + * Unregister our receive callback. + */ + RCU_INIT_POINTER(athrs_fast_nat_recv, NULL); + + /* + * Wait for all callbacks to complete. + */ + rcu_barrier(); + + /* + * Destroy all connections. 
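+ * A NULL device argument flushes the offloaded rules for every device.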
+ */ + sfe_ipv4_destroy_all_rules_for_dev(NULL); + sfe_ipv6_destroy_all_rules_for_dev(NULL); + + /* + * stop work queue, and flush all pending message in queue + */ + cancel_work_sync(&sfe_drv_ctx->work); + sfe_drv_process_response_msg(&sfe_drv_ctx->work); + + /* + * Unregister our sync callback. + */ + sfe_drv_ipv4_notify_unregister(); + sfe_drv_ipv6_notify_unregister(); + + kobject_put(sfe_drv_ctx->sys_sfe_drv); + + return; +} + +module_init(sfe_drv_init) +module_exit(sfe_drv_exit) + +MODULE_AUTHOR("Qualcomm Atheros Inc."); +MODULE_DESCRIPTION("Simulated driver for Shortcut Forwarding Engine"); +MODULE_LICENSE("Dual BSD/GPL"); + diff --git a/shortcut-fe/simulated-driver/sfe_drv.h b/shortcut-fe/simulated-driver/sfe_drv.h new file mode 100644 index 000000000..729caffd3 --- /dev/null +++ b/shortcut-fe/simulated-driver/sfe_drv.h @@ -0,0 +1,553 @@ +/* + * sfe_drv.h + * simulated driver headers for shortcut forwarding engine. + * + * Copyright (c) 2015,2016 The Linux Foundation. All rights reserved. + * Permission to use, copy, modify, and/or distribute this software for + * any purpose with or without fee is hereby granted, provided that the + * above copyright notice and this permission notice appear in all copies. + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT + * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef __SFE_DRV_H +#define __SFE_DRV_H + +#define MAX_VLAN_DEPTH 2 +#define SFE_VLAN_ID_NOT_CONFIGURED 0xfff +#define SFE_MC_IF_MAX 16 + +#define SFE_SPECIAL_INTERFACE_BASE 0x7f00 +#define SFE_SPECIAL_INTERFACE_IPV4 (SFE_SPECIAL_INTERFACE_BASE + 1) +#define SFE_SPECIAL_INTERFACE_IPV6 (SFE_SPECIAL_INTERFACE_BASE + 2) +#define SFE_SPECIAL_INTERFACE_IPSEC (SFE_SPECIAL_INTERFACE_BASE + 3) +#define SFE_SPECIAL_INTERFACE_L2TP (SFE_SPECIAL_INTERFACE_BASE + 4) +#define SFE_SPECIAL_INTERFACE_PPTP (SFE_SPECIAL_INTERFACE_BASE + 5) + +/** + * Rule creation & rule update flags. + */ +#define SFE_RULE_CREATE_FLAG_NO_SEQ_CHECK (1<<0) /**< Do not perform TCP sequence number checks */ +#define SFE_RULE_CREATE_FLAG_BRIDGE_FLOW (1<<1) /**< This is a pure bridge forwarding flow */ +#define SFE_RULE_CREATE_FLAG_ROUTED (1<<2) /**< Rule is for a routed connection */ +#define SFE_RULE_CREATE_FLAG_DSCP_MARKING (1<<3) /**< Rule has for a DSCP marking configured*/ +#define SFE_RULE_CREATE_FLAG_VLAN_MARKING (1<<4) /**< Rule has for a VLAN marking configured*/ +#define SFE_RULE_UPDATE_FLAG_CHANGE_MTU (1<<5) /**< Update MTU of connection interfaces */ +#define SFE_RULE_CREATE_FLAG_ICMP_NO_CME_FLUSH (1<<6)/**< Rule for not flushing CME on ICMP pkt */ +#define SFE_RULE_CREATE_FLAG_L2_ENCAP (1<<7) /**< consists of an encapsulating protocol that carries an IPv4 payload within it. */ +#define SFE_RULE_CREATE_FLAG_MC_JOIN (1<<8) /**< Interface has joined the flow */ +#define SFE_RULE_CREATE_FLAG_MC_LEAVE (1<<9) /**< Interface has left the flow */ +#define SFE_RULE_CREATE_FLAG_MC_UPDATE (1<<10)/**< Multicast Rule update */ +/** + * Rule creation validity flags. 
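+ *
+ * These bits are set in the valid_flags field of the rule create messages
+ * to indicate which of the optional sub-rules (TCP, QoS, DSCP, ...) carry
+ * meaningful data.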
+ */ +#define SFE_RULE_CREATE_CONN_VALID (1<<0) /**< IPv4 Connection is valid */ +#define SFE_RULE_CREATE_TCP_VALID (1<<1) /**< TCP Protocol fields are valid */ +#define SFE_RULE_CREATE_PPPOE_VALID (1<<2) /**< PPPoE fields are valid */ +#define SFE_RULE_CREATE_QOS_VALID (1<<3) /**< QoS fields are valid */ +#define SFE_RULE_CREATE_VLAN_VALID (1<<4) /**< VLAN fields are valid */ +#define SFE_RULE_CREATE_DSCP_MARKING_VALID (1<<5) /**< DSCP marking fields are valid */ +#define SFE_RULE_CREATE_VLAN_MARKING_VALID (1<<6) /**< VLAN marking fields are valid */ +#define SFE_RULE_CREATE_MC_NAT_VALID (1<<7) /**< Interface is configured with Source-NAT */ +#define SFE_RULE_CREATE_DIRECTION_VALID (1<<8) /**< specify acceleration directions */ + +/* + * 32/64 bits pointer type + */ +#ifdef __LP64__ +typedef uint64_t sfe_ptr_t; +#else +typedef uint32_t sfe_ptr_t; +#endif + +typedef enum sfe_rule_sync_reason { + SFE_RULE_SYNC_REASON_STATS, /* Sync is to synchronize stats */ + SFE_RULE_SYNC_REASON_FLUSH, /* Sync is to flush a entry */ + SFE_RULE_SYNC_REASON_EVICT, /* Sync is to evict a entry */ + SFE_RULE_SYNC_REASON_DESTROY /* Sync is to destroy a entry(requested by connection manager) */ + +} sfe_rule_sync_reason_t; + +/** + * Tx command status + */ +typedef enum { + SFE_TX_SUCCESS = 0, /**< Success */ + SFE_TX_FAILURE, /**< Command failure other than descriptor not available */ + SFE_TX_FAILURE_QUEUE, /**< Command failure due to descriptor not available */ + SFE_TX_FAILURE_NOT_READY, /**< Command failure due to SFE state uninitialized */ + SFE_TX_FAILURE_TOO_LARGE, /**< Command is too large to fit in one message */ + SFE_TX_FAILURE_TOO_SHORT, /**< Command/Packet is shorter than expected size */ + SFE_TX_FAILURE_NOT_SUPPORTED, /**< Command/Packet not accepted for forwarding */ + SFE_TX_FAILURE_BAD_PARAM, /**< Command failure due to bad parameters */ + SFE_TX_FAILURE_NOT_ENABLED, /**< Command failure due to SFE feature is not enabled */ +} sfe_tx_status_t; + +/** + * Common response structure + */ +enum sfe_cmn_response { + SFE_CMN_RESPONSE_ACK, /**< Message Acknowledge */ + SFE_CMN_RESPONSE_EVERSION, /**< Message Version Error */ + SFE_CMN_RESPONSE_EINTERFACE, /**< Message Interface Error */ + SFE_CMN_RESPONSE_ELENGTH, /**< Message Length Error */ + SFE_CMN_RESPONSE_EMSG, /**< Message Error */ + SFE_CMM_RESPONSE_NOTIFY, /**< Message Independant of Request */ + SFE_CMN_RESPONSE_LAST +}; + +/** + * IPv4 bridge/route rule messages + */ +enum sfe_message_types { + SFE_TX_CREATE_RULE_MSG, /**< IPv4/6 create rule message */ + SFE_TX_DESTROY_RULE_MSG, /**< IPv4/6 destroy rule message */ + SFE_RX_CONN_STATS_SYNC_MSG, /**< IPv4/6 connection stats sync message */ + SFE_TX_CREATE_MC_RULE_MSG, /**< IPv4/6 multicast create rule message */ + SFE_TUN6RD_ADD_UPDATE_PEER, /**< Add/update peer for 6rd tunnel */ + SFE_MAX_MSG_TYPES, /**< IPv4/6 message max type number */ +}; + +/** + * Common message structure + */ +struct sfe_cmn_msg { + u16 version; /**< Version id for main message format */ + u16 interface; /**< Primary Key for all messages */ + enum sfe_cmn_response response; /**< Primary response */ + u32 type; /**< Decetralized request #, to be used to match response # */ + u32 error; /**< Decentralized specific error message, response == EMSG */ + sfe_ptr_t cb; /**< Place for callback pointer */ + sfe_ptr_t app_data; /**< Place for app data */ + u32 len; /**< What is the length of the message excluding this header */ +}; + +/** + * Common 5 tuple structure + */ +struct sfe_ipv4_5tuple { + __be32 flow_ip; /**< Flow IP 
address */ + __be32 return_ip; /**< Return IP address */ + __be16 flow_ident; /**< Flow ident (e.g. TCP/UDP port) */ + __be16 return_ident; /**< Return ident (e.g. TCP/UDP port) */ + u8 protocol; /**< Protocol number */ + u8 reserved[3]; /**< Padded for alignment */ +}; + +/** + * Common 5 tuple structure + */ +struct sfe_ipv6_5tuple { + __be32 flow_ip[4]; /**< Flow IP address */ + __be32 return_ip[4]; /**< Return IP address */ + __be16 flow_ident; /**< Flow ident (e.g. TCP/UDP port) */ + __be16 return_ident; /**< Return ident (e.g. TCP/UDP port) */ + u8 protocol; /**< Protocol number */ + u8 reserved[3]; /**< Padded for alignment */ +}; + +/** + * Connection create structure + */ +struct sfe_ipv4_connection_rule { + u8 flow_mac[6]; /**< Flow MAC address */ + u8 return_mac[6]; /**< Return MAC address */ + s32 flow_interface_num; /**< Flow interface number */ + s32 return_interface_num; /**< Return interface number */ + s32 flow_top_interface_num; /* Top flow interface number */ + s32 return_top_interface_num;/* Top return interface number */ + u32 flow_mtu; /**< Flow interface`s MTU */ + u32 return_mtu; /**< Return interface`s MTU */ + __be32 flow_ip_xlate; /**< Translated flow IP address */ + __be32 return_ip_xlate; /**< Translated return IP address */ + __be16 flow_ident_xlate; /**< Translated flow ident (e.g. port) */ + __be16 return_ident_xlate; /**< Translated return ident (e.g. port) */ +}; + +/** + * Connection create structure + */ +struct sfe_ipv6_connection_rule { + u8 flow_mac[6]; /**< Flow MAC address */ + u8 return_mac[6]; /**< Return MAC address */ + s32 flow_interface_num; /**< Flow interface number */ + s32 return_interface_num; /**< Return interface number */ + s32 flow_top_interface_num; /* Top flow interface number */ + s32 return_top_interface_num;/* Top return interface number */ + u32 flow_mtu; /**< Flow interface's MTU */ + u32 return_mtu; /**< Return interface's MTU */ +}; + +/** + * TCP connection rule structure + */ +struct sfe_protocol_tcp_rule { + u32 flow_max_window; /**< Flow direction's largest seen window */ + u32 return_max_window; /**< Return direction's largest seen window */ + u32 flow_end; /**< Flow direction's largest seen sequence + segment length */ + u32 return_end; /**< Return direction's largest seen sequence + segment length */ + u32 flow_max_end; /**< Flow direction's largest seen ack + max(1, win) */ + u32 return_max_end; /**< Return direction's largest seen ack + max(1, win) */ + u8 flow_window_scale; /**< Flow direction's window scaling factor */ + u8 return_window_scale; /**< Return direction's window scaling factor */ + u16 reserved; /**< Padded for alignment */ +}; + +/** + * PPPoE connection rules structure + */ +struct sfe_pppoe_rule { + u16 flow_pppoe_session_id; /**< Flow direction`s PPPoE session ID. */ + u16 flow_pppoe_remote_mac[3]; /**< Flow direction`s PPPoE Server MAC address */ + u16 return_pppoe_session_id; /**< Return direction's PPPoE session ID. 
*/ + u16 return_pppoe_remote_mac[3]; /**< Return direction's PPPoE Server MAC address */ +}; + +/** + * QoS connection rule structure + */ +struct sfe_qos_rule { + u32 flow_qos_tag; /**< QoS tag associated with this rule for flow direction */ + u32 return_qos_tag; /**< QoS tag associated with this rule for return direction */ +}; + +/** + * DSCP connection rule structure + */ +struct sfe_dscp_rule { + u8 flow_dscp; /**< Egress DSCP value for flow direction */ + u8 return_dscp; /**< Egress DSCP value for return direction */ + u8 reserved[2]; /**< Padded for alignment */ +}; + +/** + * VLAN connection rule structure + */ +struct sfe_vlan_rule { + u32 ingress_vlan_tag; /**< VLAN Tag for the ingress packets */ + u32 egress_vlan_tag; /**< VLAN Tag for egress packets */ +}; + +/** + * Acceleration direction rule structure + * Sometimes we just want to accelerate traffic in one direction but not in another. + */ +struct sfe_acceleration_direction_rule { + u8 flow_accel; /**< Accelerate in flow direction */ + u8 return_accel; /**< Accelerate in return direction */ + u8 reserved[2]; /**< Padded for alignment */ +}; + +/** + * The IPv4 rule create sub-message structure. + */ +struct sfe_ipv4_rule_create_msg { + /* Request */ + u16 valid_flags; /**< Bit flags associated with the validity of parameters */ + u16 rule_flags; /**< Bit flags associated with the rule */ + + struct sfe_ipv4_5tuple tuple; /**< Holds values of the 5 tuple */ + + struct sfe_ipv4_connection_rule conn_rule; /**< Basic connection specific data */ + struct sfe_protocol_tcp_rule tcp_rule; /**< TCP related accleration parameters */ + struct sfe_pppoe_rule pppoe_rule; /**< PPPoE related accleration parameters */ + struct sfe_qos_rule qos_rule; /**< QoS related accleration parameters */ + struct sfe_dscp_rule dscp_rule; /**< DSCP related accleration parameters */ + struct sfe_vlan_rule vlan_primary_rule; /**< Primary VLAN related accleration parameters */ + struct sfe_vlan_rule vlan_secondary_rule; /**< Secondary VLAN related accleration parameters */ +#ifdef CONFIG_XFRM + struct sfe_acceleration_direction_rule direction_rule;/* Direction related accleration parameters*/ +#endif + /* Response */ + u32 index; /**< Slot ID for cache stats to host OS */ +}; + +/** + * The IPv4 rule destroy sub-message structure. + */ +struct sfe_ipv4_rule_destroy_msg { + struct sfe_ipv4_5tuple tuple; /**< Holds values of the 5 tuple */ +}; + +/** + * The SFE IPv4 rule sync structure. + */ +struct sfe_ipv4_conn_sync { + u32 index; /**< Slot ID for cache stats to host OS */ + u8 protocol; /**< Protocol number */ + __be32 flow_ip; /**< Flow IP address */ + __be32 flow_ip_xlate; /**< Translated flow IP address */ + __be16 flow_ident; /**< Flow ident (e.g. port) */ + __be16 flow_ident_xlate; /**< Translated flow ident (e.g. port) */ + u32 flow_max_window; /**< Flow direction's largest seen window */ + u32 flow_end; /**< Flow direction's largest seen sequence + segment length */ + u32 flow_max_end; /**< Flow direction's largest seen ack + max(1, win) */ + u32 flow_rx_packet_count; /**< Flow interface's RX packet count */ + u32 flow_rx_byte_count; /**< Flow interface's RX byte count */ + u32 flow_tx_packet_count; /**< Flow interface's TX packet count */ + u32 flow_tx_byte_count; /**< Flow interface's TX byte count */ + u16 flow_pppoe_session_id; /**< Flow interface`s PPPoE session ID. 
*/ + u16 flow_pppoe_remote_mac[3]; + /**< Flow interface's PPPoE remote server MAC address if there is any */ + __be32 return_ip; /**< Return IP address */ + __be32 return_ip_xlate; /**< Translated return IP address */ + __be16 return_ident; /**< Return ident (e.g. port) */ + __be16 return_ident_xlate; /**< Translated return ident (e.g. port) */ + u32 return_max_window; /**< Return direction's largest seen window */ + u32 return_end; /**< Return direction's largest seen sequence + segment length */ + u32 return_max_end; /**< Return direction's largest seen ack + max(1, win) */ + u32 return_rx_packet_count; + /**< Return interface's RX packet count */ + u32 return_rx_byte_count; /**< Return interface's RX byte count */ + u32 return_tx_packet_count; + /**< Return interface's TX packet count */ + u32 return_tx_byte_count; /**< Return interface's TX byte count */ + u16 return_pppoe_session_id; + /**< Return interface`s PPPoE session ID. */ + u16 return_pppoe_remote_mac[3]; + /**< Return interface's PPPoE remote server MAC address if there is any */ + u32 inc_ticks; /**< Number of ticks since the last sync */ + u32 reason; /**< Reason for the sync */ + + u8 flags; /**< Bit flags associated with the rule */ + u32 qos_tag; /**< QoS Tag */ + u32 cause; /**< Flush Cause */ +}; + +/* + * Message structure to send/receive IPv4 bridge/route commands + */ +struct sfe_ipv4_msg { + struct sfe_cmn_msg cm; /**< Message Header */ + union { + struct sfe_ipv4_rule_create_msg rule_create; /**< Message: rule create */ + struct sfe_ipv4_rule_destroy_msg rule_destroy; /**< Message: rule destroy */ + struct sfe_ipv4_conn_sync conn_stats; /**< Message: connection stats sync */ + } msg; +}; + +/** + * Callback to be called when IPv4 message is received + */ +typedef void (*sfe_ipv4_msg_callback_t)(void *app_data, struct sfe_ipv4_msg *msg); + +/** + * The IPv6 rule create sub-message structure. + */ +struct sfe_ipv6_rule_create_msg { + /* + * Request + */ + u16 valid_flags; /**< Bit flags associated with the validity of parameters */ + u16 rule_flags; /**< Bit flags associated with the rule */ + struct sfe_ipv6_5tuple tuple; /**< Holds values of the 5 tuple */ + struct sfe_ipv6_connection_rule conn_rule; /**< Basic connection specific data */ + struct sfe_protocol_tcp_rule tcp_rule; /**< Protocol related accleration parameters */ + struct sfe_pppoe_rule pppoe_rule; /**< PPPoE related accleration parameters */ + struct sfe_qos_rule qos_rule; /**< QoS related accleration parameters */ + struct sfe_dscp_rule dscp_rule; /**< DSCP related accleration parameters */ + struct sfe_vlan_rule vlan_primary_rule; /**< VLAN related accleration parameters */ + struct sfe_vlan_rule vlan_secondary_rule; /**< VLAN related accleration parameters */ +#ifdef CONFIG_XFRM + struct sfe_acceleration_direction_rule direction_rule;/* Direction related accleration parameters*/ +#endif + /* + * Response + */ + u32 index; /**< Slot ID for cache stats to host OS */ +}; + +/** + * The IPv6 rule destroy sub-message structure. + */ +struct sfe_ipv6_rule_destroy_msg { + struct sfe_ipv6_5tuple tuple; /**< Holds values of the 5 tuple */ +}; + +/** + * The SFE IPv6 rule sync structure. + */ +struct sfe_ipv6_conn_sync { + u32 index; /**< Slot ID for cache stats to host OS */ + u8 protocol; /**< Protocol number */ + __be32 flow_ip[4]; /**< Flow IP address */ + __be16 flow_ident; /**< Flow ident (e.g. 
port) */ + u32 flow_max_window; /**< Flow direction's largest seen window */ + u32 flow_end; /**< Flow direction's largest seen sequence + segment length */ + u32 flow_max_end; /**< Flow direction's largest seen ack + max(1, win) */ + u32 flow_rx_packet_count; /**< Flow interface's RX packet count */ + u32 flow_rx_byte_count; /**< Flow interface's RX byte count */ + u32 flow_tx_packet_count; /**< Flow interface's TX packet count */ + u32 flow_tx_byte_count; /**< Flow interface's TX byte count */ + u16 flow_pppoe_session_id; /**< Flow interface`s PPPoE session ID. */ + u16 flow_pppoe_remote_mac[3]; + /**< Flow interface's PPPoE remote server MAC address if there is any */ + __be32 return_ip[4]; /**< Return IP address */ + __be16 return_ident; /**< Return ident (e.g. port) */ + u32 return_max_window; /**< Return direction's largest seen window */ + u32 return_end; /**< Return direction's largest seen sequence + segment length */ + u32 return_max_end; /**< Return direction's largest seen ack + max(1, win) */ + u32 return_rx_packet_count; + /**< Return interface's RX packet count */ + u32 return_rx_byte_count; /**< Return interface's RX byte count */ + u32 return_tx_packet_count; + /**< Return interface's TX packet count */ + u32 return_tx_byte_count; /**< Return interface's TX byte count */ + u16 return_pppoe_session_id; + /**< Return interface`s PPPoE session ID. */ + u16 return_pppoe_remote_mac[3]; + /**< Return interface's PPPoE remote server MAC address if there is any */ + u32 inc_ticks; /**< Number of ticks since the last sync */ + u32 reason; /**< Reason for the sync */ + u8 flags; /**< Bit flags associated with the rule */ + u32 qos_tag; /**< QoS Tag */ + u32 cause; /**< Flush cause associated with the rule */ +}; + +/** + * Message structure to send/receive IPv6 bridge/route commands + */ +struct sfe_ipv6_msg { + struct sfe_cmn_msg cm; /**< Message Header */ + union { + struct sfe_ipv6_rule_create_msg rule_create; /**< Message: rule create */ + struct sfe_ipv6_rule_destroy_msg rule_destroy; /**< Message: rule destroy */ + struct sfe_ipv6_conn_sync conn_stats; /**< Message: stats sync */ + } msg; +}; + +/** + * Callback to be called when IPv6 message is received + */ +typedef void (*sfe_ipv6_msg_callback_t)(void *app_data, struct sfe_ipv6_msg *msg); + +/** + * 6rd tunnel peer addr. 
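+ *
+ * Kept for interface compatibility; in this simulated driver
+ * sfe_tun6rd_tx() always reports SFE_TX_FAILURE_NOT_ENABLED.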
+ */ +struct sfe_tun6rd_set_peer_msg { + __be32 ipv6_address[4]; /* The peer's ipv6 addr*/ + __be32 dest; /* The peer's ipv4 addr*/ +}; + +/** + * Message structure to send/receive 6rd tunnel messages + */ +struct sfe_tun6rd_msg { + struct sfe_cmn_msg cm; /* Message Header */ + union { + struct sfe_tun6rd_set_peer_msg peer; /* Message: add/update peer */ + } msg; +}; + +/* + * sfe driver context instance + */ +struct sfe_drv_ctx_instance { + int not_used; +}; + +/* + * sfe_drv_ipv4_max_conn_count() + * Return the maximum number of IPv4 connections that the sfe acceleration engine supports + * + * @return int The number of connections that can be accelerated by the sfe + */ +int sfe_drv_ipv4_max_conn_count(void); + +/* + * sfe_drv_ipv4_tx() + * Transmit an IPv4 message to the sfe + * + * @param sfe_drv_ctx sfe driver context + * @param msg The IPv4 message + * + * @return sfe_tx_status_t The status of the Tx operation + */ +extern sfe_tx_status_t sfe_drv_ipv4_tx(struct sfe_drv_ctx_instance *sfe_drv_ctx, struct sfe_ipv4_msg *msg); + +/* + * sfe_drv_ipv4_notify_register() + * Register a notifier callback for IPv4 messages from sfe driver + * + * @param cb The callback pointer + * @param app_data The application context for this message + * + * @return struct sfe_drv_ctx_instance * The sfe driver context + */ +extern struct sfe_drv_ctx_instance *sfe_drv_ipv4_notify_register(sfe_ipv4_msg_callback_t cb, void *app_data); + +/* + * sfe_drv_ipv4_notify_unregister() + * Un-Register a notifier callback for IPv4 messages from sfe driver + */ +extern void sfe_drv_ipv4_notify_unregister(void); + +/* + * sfe_ipv4_msg_init() + * IPv4 message init + */ +extern void sfe_ipv4_msg_init(struct sfe_ipv4_msg *nim, u16 if_num, u32 type, u32 len, + sfe_ipv4_msg_callback_t cb, void *app_data); + +/* + * sfe_drv_ipv6_max_conn_count() + * Return the maximum number of IPv6 connections that the sfe acceleration engine supports + * + * @return int The number of connections that can be accelerated by the sfe + */ +int sfe_drv_ipv6_max_conn_count(void); + +/* + * sfe_drv_ipv6_tx() + * Transmit an IPv6 message to the sfe + * + * @param sfe_drv_ctx sfe driver context + * @param msg The IPv6 message + * + * @return sfe_tx_status_t The status of the Tx operation + */ +extern sfe_tx_status_t sfe_drv_ipv6_tx(struct sfe_drv_ctx_instance *sfe_drv_ctx, struct sfe_ipv6_msg *msg); + +/* + * sfe_drv_ipv6_notify_register() + * Register a notifier callback for IPv6 messages from sfe driver + * + * @param cb The callback pointer + * @param app_data The application context for this message + * + * @return struct sfe_drv_ctx_instance * The sfe driver context + */ +extern struct sfe_drv_ctx_instance *sfe_drv_ipv6_notify_register(sfe_ipv6_msg_callback_t cb, void *app_data); + +/* + * sfe_drv_ipv6_notify_unregister() + * Un-Register a notifier callback for IPv6 messages from sfe driver + */ +extern void sfe_drv_ipv6_notify_unregister(void); + +/* + * sfe_ipv6_msg_init() + * IPv6 message init + */ +extern void sfe_ipv6_msg_init(struct sfe_ipv6_msg *nim, u16 if_num, u32 type, u32 len, + sfe_ipv6_msg_callback_t cb, void *app_data); + +/* + * sfe_tun6rd_tx() + * Transmit a tun6rd message to sfe engine + */ +sfe_tx_status_t sfe_tun6rd_tx(struct sfe_drv_ctx_instance *sfe_ctx, struct sfe_tun6rd_msg *msg); + +/* + * sfe_tun6rd_msg_init() + * Initialize sfe_tun6rd msg. 
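+ *
+ * Thin wrapper around sfe_cmn_msg_init() that only fills the common
+ * message header.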
+ */ +void sfe_tun6rd_msg_init(struct sfe_tun6rd_msg *ncm, u16 if_num, u32 type, u32 len, + void *cb, void *app_data); + +#endif /* __SFE_DRV_H */ diff --git a/simulated-driver/Makefile b/simulated-driver/Makefile deleted file mode 100755 index 0c710fe9d..000000000 --- a/simulated-driver/Makefile +++ /dev/null @@ -1,60 +0,0 @@ -# -# Copyright (c) 2015,2016 The Linux Foundation. All rights reserved. -# Permission to use, copy, modify, and/or distribute this software for -# any purpose with or without fee is hereby granted, provided that the -# above copyright notice and this permission notice appear in all copies. -# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES -# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF -# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR -# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN -# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT -# OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. -# - -include $(TOPDIR)/rules.mk -include $(INCLUDE_DIR)/kernel.mk - -PKG_NAME:=shortcut-fe-simulated-driver -PKG_RELEASE:=1 - -PKG_SOURCE_URL:=https://source.codeaurora.org/quic/qsdk/oss/lklm/shortcut-fe -PKG_SOURCE_PROTO:=git -PKG_SOURCE_DATE:=2021-03-17 -PKG_SOURCE_VERSION:=697977d8d0ccf0ab596e5692d08608a75dd7f33d -PKG_MIRROR_HASH:=659fa82a431e15af797a6c7069faeee02810453ad8b576c51c29f95a1761a045 - -include $(INCLUDE_DIR)/package.mk - -define KernelPackage/shortcut-fe-drv - SECTION:=kernel - CATEGORY:=Kernel modules - SUBMENU:=Network Support - DEPENDS:=@TARGET_ipq60xx||@TARGET_ipq806x||TARGET_ipq807x +kmod-shortcut-fe - KCONFIG:= \ - CONFIG_NET_CLS_ACT=y \ - CONFIG_XFRM=y - TITLE:=Simulated sfe driver for ECM - FILES:=$(PKG_BUILD_DIR)/simulated-driver/shortcut-fe-drv.ko -endef - -define KernelPackage/shortcut-fe-drv/Description -Simulated sfe driver which act as an adapter to convert message -between a connection manager and the SFE core engine. -endef - -define Build/Compile - $(MAKE) $(PKG_JOBS) -C "$(LINUX_DIR)" \ - $(KERNEL_MAKE_FLAGS) \ - $(PKG_MAKE_FLAGS) \ - M="$(PKG_BUILD_DIR)/simulated-driver" \ - EXTRA_CFLAGS="-DSFE_SUPPORT_IPV6" \ - modules -endef - -define Build/InstallDev - $(INSTALL_DIR) $(1)/usr/include/shortcut-fe - $(CP) -rf $(PKG_BUILD_DIR)/simulated-driver/sfe_drv.h $(1)/usr/include/shortcut-fe -endef - -$(eval $(call KernelPackage,shortcut-fe-drv)) diff --git a/simulated-driver/patches/200-nss-qdisc-support.patch b/simulated-driver/patches/200-nss-qdisc-support.patch deleted file mode 100755 index 638ad8a84..000000000 --- a/simulated-driver/patches/200-nss-qdisc-support.patch +++ /dev/null @@ -1,11 +0,0 @@ ---- ./simulated-driver/sfe_drv.c.orig 2020-06-16 12:49:47.680153371 +0800 -+++ ./simulated-driver/sfe_drv.c 2020-06-16 12:50:18.540153371 +0800 -@@ -1167,7 +1167,7 @@ int sfe_drv_recv(struct sk_buff *skb) - * If ingress Qdisc configured, and packet not processed by ingress Qdisc yet - * We can not accelerate this packet. - */ -- if (dev->ingress_queue && !(skb->tc_verd & TC_NCLS)) { -+ if (dev->ingress_queue && !(skb->tc_verd_qca_nss & TC_NCLS)) { - return 0; - } - #endif