From 5ed20a68cd6ca4adc0aa2d240913d604a2eb3e25 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 1 Jul 2014 21:32:05 -0700 Subject: [PATCH 1/8] flow_dissector: Abstract out hash computation Move the hash computation located in __skb_get_hash to be a separate function which takes flow_keys as input. This will allow flow hash computation in other contexts where we only have addresses and ports. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/flow_keys.h | 1 + net/core/flow_dissector.c | 44 +++++++++++++++++++++++++-------------- 2 files changed, 29 insertions(+), 16 deletions(-) diff --git a/include/net/flow_keys.h b/include/net/flow_keys.h index fbefdca5e283..6667a054763a 100644 --- a/include/net/flow_keys.h +++ b/include/net/flow_keys.h @@ -29,4 +29,5 @@ struct flow_keys { bool skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow); __be32 skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto); +u32 flow_hash_from_keys(struct flow_keys *keys); #endif diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index c2b53c1b21d2..2ff8cd4dfc5f 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -202,6 +202,33 @@ static __always_inline u32 __flow_hash_1word(u32 a) return jhash_1word(a, hashrnd); } +static inline u32 __flow_hash_from_keys(struct flow_keys *keys) +{ + u32 hash; + + /* get a consistent hash (same value on both flow directions) */ + if (((__force u32)keys->dst < (__force u32)keys->src) || + (((__force u32)keys->dst == (__force u32)keys->src) && + ((__force u16)keys->port16[1] < (__force u16)keys->port16[0]))) { + swap(keys->dst, keys->src); + swap(keys->port16[0], keys->port16[1]); + } + + hash = __flow_hash_3words((__force u32)keys->dst, + (__force u32)keys->src, + (__force u32)keys->ports); + if (!hash) + hash = 1; + + return hash; +} + +u32 flow_hash_from_keys(struct flow_keys *keys) +{ + return __flow_hash_from_keys(keys); +} +EXPORT_SYMBOL(flow_hash_from_keys); + /* * __skb_get_hash: calculate a flow hash based on src/dst addresses * and src/dst port numbers. Sets hash in skb to non-zero hash value @@ -211,7 +238,6 @@ static __always_inline u32 __flow_hash_1word(u32 a) void __skb_get_hash(struct sk_buff *skb) { struct flow_keys keys; - u32 hash; if (!skb_flow_dissect(skb, &keys)) return; @@ -219,21 +245,7 @@ void __skb_get_hash(struct sk_buff *skb) if (keys.ports) skb->l4_hash = 1; - /* get a consistent hash (same value on both flow directions) */ - if (((__force u32)keys.dst < (__force u32)keys.src) || - (((__force u32)keys.dst == (__force u32)keys.src) && - ((__force u16)keys.port16[1] < (__force u16)keys.port16[0]))) { - swap(keys.dst, keys.src); - swap(keys.port16[0], keys.port16[1]); - } - - hash = __flow_hash_3words((__force u32)keys.dst, - (__force u32)keys.src, - (__force u32)keys.ports); - if (!hash) - hash = 1; - - skb->hash = hash; + skb->hash = __flow_hash_from_keys(&keys); } EXPORT_SYMBOL(__skb_get_hash); From b73c3d0e4f0e1961e15bec18720e48aabebe2109 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 1 Jul 2014 21:32:17 -0700 Subject: [PATCH 2/8] net: Save TX flow hash in sock and set in skbuf on xmit For a connected socket we can precompute the flow hash for setting in skb->hash on output. This is a performance advantage over calculating the skb->hash for every packet on the connection. The computation is done using the common hash algorithm to be consistent with computations done for packets of the connection in other states where thers is no socket (e.g. time-wait, syn-recv, syn-cookies). This patch adds sk_txhash to the sock structure. inet_set_txhash and ip6_set_txhash functions are added which are called from points in TCP and UDP where socket moves to established state. skb_set_hash_from_sk is a function which sets skb->hash from the sock txhash value. This is called in UDP and TCP transmit path when transmitting within the context of a socket. Tested: ran super_netperf with 200 TCP_RR streams over a vxlan interface (in this case skb_get_hash called on every TX packet to create a UDP source port). Before fix: 95.02% CPU utilization 154/256/505 90/95/99% latencies 1.13042e+06 tps Time in functions: 0.28% skb_flow_dissect 0.21% __skb_get_hash After fix: 94.95% CPU utilization 156/254/485 90/95/99% latencies 1.15447e+06 Neither __skb_get_hash nor skb_flow_dissect appear in perf Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/ip.h | 14 ++++++++++++++ include/net/ipv6.h | 15 +++++++++++++++ include/net/sock.h | 11 +++++++++++ net/ipv4/datagram.c | 1 + net/ipv4/tcp_ipv4.c | 3 +++ net/ipv4/tcp_output.c | 1 + net/ipv6/datagram.c | 1 + net/ipv6/tcp_ipv6.c | 4 ++++ 8 files changed, 50 insertions(+) diff --git a/include/net/ip.h b/include/net/ip.h index 0e795df05ec9..2e8f055989c3 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -31,6 +31,7 @@ #include #include #include +#include struct sock; @@ -353,6 +354,19 @@ static inline __wsum inet_compute_pseudo(struct sk_buff *skb, int proto) skb->len, proto, 0); } +static inline void inet_set_txhash(struct sock *sk) +{ + struct inet_sock *inet = inet_sk(sk); + struct flow_keys keys; + + keys.src = inet->inet_saddr; + keys.dst = inet->inet_daddr; + keys.port16[0] = inet->inet_sport; + keys.port16[1] = inet->inet_dport; + + sk->sk_txhash = flow_hash_from_keys(&keys); +} + /* * Map a multicast IP onto multicast MAC for type ethernet. */ diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 574337fe72dd..2aa86e1135a1 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -19,6 +19,7 @@ #include #include #include +#include #include #define SIN6_LEN_RFC2133 24 @@ -684,6 +685,20 @@ static inline int ip6_sk_dst_hoplimit(struct ipv6_pinfo *np, struct flowi6 *fl6, return hlimit; } +static inline void ip6_set_txhash(struct sock *sk) +{ + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + struct flow_keys keys; + + keys.src = (__force __be32)ipv6_addr_hash(&np->saddr); + keys.dst = (__force __be32)ipv6_addr_hash(&sk->sk_v6_daddr); + keys.port16[0] = inet->inet_sport; + keys.port16[1] = inet->inet_dport; + + sk->sk_txhash = flow_hash_from_keys(&keys); +} + /* * Header manipulation */ diff --git a/include/net/sock.h b/include/net/sock.h index 8d4c9473e7d7..cb84b2f1ad8f 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -273,6 +273,7 @@ struct cg_proto; * @sk_rcvtimeo: %SO_RCVTIMEO setting * @sk_sndtimeo: %SO_SNDTIMEO setting * @sk_rxhash: flow hash received from netif layer + * @sk_txhash: computed flow hash for use on transmit * @sk_filter: socket filtering instructions * @sk_protinfo: private area, net family specific, when not using slab * @sk_timer: sock cleanup timer @@ -347,6 +348,7 @@ struct sock { #ifdef CONFIG_RPS __u32 sk_rxhash; #endif + __u32 sk_txhash; #ifdef CONFIG_NET_RX_BUSY_POLL unsigned int sk_napi_id; unsigned int sk_ll_usec; @@ -1980,6 +1982,14 @@ static inline void sock_poll_wait(struct file *filp, } } +static inline void skb_set_hash_from_sk(struct sk_buff *skb, struct sock *sk) +{ + if (sk->sk_txhash) { + skb->l4_hash = 1; + skb->hash = sk->sk_txhash; + } +} + /* * Queue a received datagram if it will fit. Stream and sequenced * protocols can't normally use this as they need to fit buffers in @@ -1994,6 +2004,7 @@ static inline void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) skb_orphan(skb); skb->sk = sk; skb->destructor = sock_wfree; + skb_set_hash_from_sk(skb, sk); /* * We used to take a refcount on sk, but following operation * is enough to guarantee sk_free() wont free this sock until diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index a3095fdefbed..90c0e8386116 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c @@ -76,6 +76,7 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) inet->inet_daddr = fl4->daddr; inet->inet_dport = usin->sin_port; sk->sk_state = TCP_ESTABLISHED; + inet_set_txhash(sk); inet->inet_id = jiffies; sk_dst_set(sk, &rt->dst); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 52d0f6a1ec2c..1edc739b9da5 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -208,6 +208,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) inet->inet_dport = usin->sin_port; inet->inet_daddr = daddr; + inet_set_txhash(sk); + inet_csk(sk)->icsk_ext_hdr_len = 0; if (inet_opt) inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen; @@ -1334,6 +1336,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newinet->mc_ttl = ip_hdr(skb)->ttl; newinet->rcv_tos = ip_hdr(skb)->tos; inet_csk(newsk)->icsk_ext_hdr_len = 0; + inet_set_txhash(newsk); if (inet_opt) inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; newinet->inet_id = newtp->write_seq ^ jiffies; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index f8f2a944a1ce..bcee13c4627c 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -916,6 +916,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, skb_orphan(skb); skb->sk = sk; skb->destructor = tcp_wfree; + skb_set_hash_from_sk(skb, sk); atomic_add(skb->truesize, &sk->sk_wmem_alloc); /* Build TCP header and checksum it. */ diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index c3bf2d2e519e..2753319524f1 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -199,6 +199,7 @@ ipv4_connected: NULL); sk->sk_state = TCP_ESTABLISHED; + ip6_set_txhash(sk); out: fl6_sock_release(flowlabel); return err; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index a97c95585da8..22055b098428 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -198,6 +198,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, sk->sk_v6_daddr = usin->sin6_addr; np->flow_label = fl6.flowlabel; + ip6_set_txhash(sk); + /* * TCP over IPv4 */ @@ -1132,6 +1134,8 @@ static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newsk->sk_v6_rcv_saddr = ireq->ir_v6_loc_addr; newsk->sk_bound_dev_if = ireq->ir_iif; + ip6_set_txhash(newsk); + /* Now IPv6 options... First: no IPv4 options. From 0e001614e849b68cff94cda8db8b550569d3dba6 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 1 Jul 2014 21:32:27 -0700 Subject: [PATCH 3/8] net: Call skb_get_hash in get_xps_queue and __skb_tx_hash Call standard function to get a packet hash instead of taking this from skb->sk->sk_hash or only using skb->protocol. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- include/linux/skbuff.h | 2 +- net/core/flow_dissector.c | 29 +++++------------------------ 3 files changed, 7 insertions(+), 26 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 66f9a04ec270..8b43a28ee0bc 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2486,7 +2486,7 @@ static inline int netif_set_xps_queue(struct net_device *dev, * as a distribution range limit for the returned value. */ static inline u16 skb_tx_hash(const struct net_device *dev, - const struct sk_buff *skb) + struct sk_buff *skb) { return __skb_tx_hash(dev, skb, dev->real_num_tx_queues); } diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index ec89301ada41..b297af70ac30 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3005,7 +3005,7 @@ static inline bool skb_rx_queue_recorded(const struct sk_buff *skb) return skb->queue_mapping != 0; } -u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb, +u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb, unsigned int num_tx_queues); static inline struct sec_path *skb_sec_path(struct sk_buff *skb) diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 2ff8cd4dfc5f..62d1cb624f53 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -196,12 +196,6 @@ static __always_inline u32 __flow_hash_3words(u32 a, u32 b, u32 c) return jhash_3words(a, b, c, hashrnd); } -static __always_inline u32 __flow_hash_1word(u32 a) -{ - __flow_hash_secret_init(); - return jhash_1word(a, hashrnd); -} - static inline u32 __flow_hash_from_keys(struct flow_keys *keys) { u32 hash; @@ -253,7 +247,7 @@ EXPORT_SYMBOL(__skb_get_hash); * Returns a Tx hash based on the given packet descriptor a Tx queues' number * to be used as a distribution range. */ -u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb, +u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb, unsigned int num_tx_queues) { u32 hash; @@ -273,13 +267,7 @@ u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb, qcount = dev->tc_to_txq[tc].count; } - if (skb->sk && skb->sk->sk_hash) - hash = skb->sk->sk_hash; - else - hash = (__force u16) skb->protocol; - hash = __flow_hash_1word(hash); - - return (u16) (((u64) hash * qcount) >> 32) + qoffset; + return (u16) (((u64)skb_get_hash(skb) * qcount) >> 32) + qoffset; } EXPORT_SYMBOL(__skb_tx_hash); @@ -351,17 +339,10 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) if (map) { if (map->len == 1) queue_index = map->queues[0]; - else { - u32 hash; - if (skb->sk && skb->sk->sk_hash) - hash = skb->sk->sk_hash; - else - hash = (__force u16) skb->protocol ^ - skb->hash; - hash = __flow_hash_1word(hash); + else queue_index = map->queues[ - ((u64)hash * map->len) >> 32]; - } + ((u64)skb_get_hash(skb) * map->len) >> 32]; + if (unlikely(queue_index >= dev->real_num_tx_queues)) queue_index = -1; } From b8f1a55639e6a76cfd274cc7de76eafac9a15ca9 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 1 Jul 2014 21:32:39 -0700 Subject: [PATCH 4/8] udp: Add function to make source port for UDP tunnels This patch adds udp_flow_src_port function which is intended to be a common function that UDP tunnel implementations call to set the source port. The source port is chosen so that a hash over the outer headers (IP addresses and UDP ports) acts as suitable hash for the flow of the encapsulated packet. In this manner, UDP encapsulation works with RSS and ECMP based wrt the inner flow. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/udp.h | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/include/net/udp.h b/include/net/udp.h index 68a1fefe3dfe..70f941368ace 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -176,6 +176,35 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, int (*)(const struct sock *, const struct sock *), unsigned int hash2_nulladdr); +static inline __be16 udp_flow_src_port(struct net *net, struct sk_buff *skb, + int min, int max, bool use_eth) +{ + u32 hash; + + if (min >= max) { + /* Use default range */ + inet_get_local_port_range(net, &min, &max); + } + + hash = skb_get_hash(skb); + if (unlikely(!hash) && use_eth) { + /* Can't find a normal hash, caller has indicated an Ethernet + * packet so use that to compute a hash. + */ + hash = jhash(skb->data, 2 * ETH_ALEN, + (__force u32) skb->protocol); + } + + /* Since this is being sent on the wire obfuscate hash a bit + * to minimize possbility that any useful information to an + * attacker is leaked. Only upper 16 bits are relevant in the + * computation for 16 bit port value. + */ + hash ^= hash << 16; + + return htons((((u64) hash * (max - min)) >> 32) + min); +} + /* net/ipv4/udp.c */ void udp_v4_early_demux(struct sk_buff *skb); int udp_get_port(struct sock *sk, unsigned short snum, From 535fb8d006bc6a96d59558181a9a6f267be382c5 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 1 Jul 2014 21:32:49 -0700 Subject: [PATCH 5/8] vxlan: Call udp_flow_src_port In vxlan and OVS vport-vxlan call common function to get source port for a UDP tunnel. Removed vxlan_src_port since the functionality is now in udp_flow_src_port. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 26 ++------------------------ include/net/vxlan.h | 2 -- net/openvswitch/vport-vxlan.c | 5 +---- 3 files changed, 3 insertions(+), 30 deletions(-) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index ade33ef82823..c2d360150804 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1570,25 +1570,6 @@ static bool route_shortcircuit(struct net_device *dev, struct sk_buff *skb) return false; } -/* Compute source port for outgoing packet - * first choice to use L4 flow hash since it will spread - * better and maybe available from hardware - * secondary choice is to use jhash on the Ethernet header - */ -__be16 vxlan_src_port(__u16 port_min, __u16 port_max, struct sk_buff *skb) -{ - unsigned int range = (port_max - port_min) + 1; - u32 hash; - - hash = skb_get_hash(skb); - if (!hash) - hash = jhash(skb->data, 2 * ETH_ALEN, - (__force u32) skb->protocol); - - return htons((((u64) hash * range) >> 32) + port_min); -} -EXPORT_SYMBOL_GPL(vxlan_src_port); - static inline struct sk_buff *vxlan_handle_offloads(struct sk_buff *skb, bool udp_csum) { @@ -1807,7 +1788,8 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, if (tos == 1) tos = ip_tunnel_get_dsfield(old_iph, skb); - src_port = vxlan_src_port(vxlan->port_min, vxlan->port_max, skb); + src_port = udp_flow_src_port(dev_net(dev), skb, vxlan->port_min, + vxlan->port_max, true); if (dst->sa.sa_family == AF_INET) { memset(&fl4, 0, sizeof(fl4)); @@ -2235,7 +2217,6 @@ static void vxlan_setup(struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); unsigned int h; - int low, high; eth_hw_addr_random(dev); ether_setup(dev); @@ -2272,9 +2253,6 @@ static void vxlan_setup(struct net_device *dev) vxlan->age_timer.function = vxlan_cleanup; vxlan->age_timer.data = (unsigned long) vxlan; - inet_get_local_port_range(dev_net(dev), &low, &high); - vxlan->port_min = low; - vxlan->port_max = high; vxlan->dst_port = htons(vxlan_port); vxlan->dev = dev; diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 12196ce661d9..d5f59f3fc35d 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -45,8 +45,6 @@ int vxlan_xmit_skb(struct vxlan_sock *vs, __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port, __be32 vni, bool xnet); -__be16 vxlan_src_port(__u16 port_min, __u16 port_max, struct sk_buff *skb); - /* IP header + UDP + VXLAN + Ethernet header */ #define VXLAN_HEADROOM (20 + 8 + 8 + 14) /* IPv6 header + UDP + VXLAN + Ethernet header */ diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c index 0edbd95c60e7..d8b7e247bebf 100644 --- a/net/openvswitch/vport-vxlan.c +++ b/net/openvswitch/vport-vxlan.c @@ -143,8 +143,6 @@ static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb) struct rtable *rt; struct flowi4 fl; __be16 src_port; - int port_min; - int port_max; __be16 df; int err; @@ -172,8 +170,7 @@ static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb) skb->ignore_df = 1; - inet_get_local_port_range(net, &port_min, &port_max); - src_port = vxlan_src_port(port_min, port_max, skb); + src_port = udp_flow_src_port(net, skb, 0, 0, true); err = vxlan_xmit_skb(vxlan_port->vs, rt, skb, fl.saddr, OVS_CB(skb)->tun_key->ipv4_dst, From 19469a873bafd4e65daef3597db2bd724c1b03c9 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 1 Jul 2014 21:33:01 -0700 Subject: [PATCH 6/8] flow_dissector: Use IPv6 flow label in flow_dissector This patch implements the receive side to support RFC 6438 which is to use the flow label as an ECMP hash. If an IPv6 flow label is set in a packet we can use this as input for computing an L4-hash. There should be no need to parse any transport headers in this case. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 62d1cb624f53..c5f3912dad4c 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -80,6 +80,8 @@ ip: case htons(ETH_P_IPV6): { const struct ipv6hdr *iph; struct ipv6hdr _iph; + __be32 flow_label; + ipv6: iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph); if (!iph) @@ -89,6 +91,21 @@ ipv6: flow->src = (__force __be32)ipv6_addr_hash(&iph->saddr); flow->dst = (__force __be32)ipv6_addr_hash(&iph->daddr); nhoff += sizeof(struct ipv6hdr); + + flow_label = ip6_flowlabel(iph); + if (flow_label) { + /* Awesome, IPv6 packet has a flow label so we can + * use that to represent the ports without any + * further dissection. + */ + flow->n_proto = proto; + flow->ip_proto = ip_proto; + flow->ports = flow_label; + flow->thoff = (u16)nhoff; + + return true; + } + break; } case htons(ETH_P_8021AD): From cb1ce2ef387b01686469487edd45994872d52d73 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 1 Jul 2014 21:33:10 -0700 Subject: [PATCH 7/8] ipv6: Implement automatic flow label generation on transmit Automatically generate flow labels for IPv6 packets on transmit. The flow label is computed based on skb_get_hash. The flow label will only automatically be set when it is zero otherwise (i.e. flow label manager hasn't set one). This supports the transmit side functionality of RFC 6438. Added an IPv6 sysctl auto_flowlabels to enable/disable this behavior system wide, and added IPV6_AUTOFLOWLABEL socket option to enable this functionality per socket. By default, auto flowlabels are disabled to avoid possible conflicts with flow label manager, however if this feature proves useful we may want to enable it by default. It should also be noted that FreeBSD has already implemented automatic flow labels (including the sysctl and socket option). In FreeBSD, automatic flow labels default to enabled. Performance impact: Running super_netperf with 200 flows for TCP_RR and UDP_RR for IPv6. Note that in UDP case, __skb_get_hash will be called for every packet with explains slight regression. In the TCP case the hash is saved in the socket so there is no regression. Automatic flow labels disabled: TCP_RR: 86.53% CPU utilization 127/195/322 90/95/99% latencies 1.40498e+06 tps UDP_RR: 90.70% CPU utilization 118/168/243 90/95/99% latencies 1.50309e+06 tps Automatic flow labels enabled: TCP_RR: 85.90% CPU utilization 128/199/337 90/95/99% latencies 1.40051e+06 UDP_RR 92.61% CPU utilization 115/164/236 90/95/99% latencies 1.4687e+06 Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 9 +++++++++ include/linux/ipv6.h | 3 ++- include/net/ipv6.h | 20 ++++++++++++++++++++ include/net/netns/ipv6.h | 1 + include/uapi/linux/in6.h | 1 + net/ipv6/af_inet6.c | 1 + net/ipv6/ip6_gre.c | 7 +++++-- net/ipv6/ip6_output.c | 7 +++++-- net/ipv6/ip6_tunnel.c | 3 ++- net/ipv6/ipv6_sockglue.c | 8 ++++++++ net/ipv6/sysctl_net_ipv6.c | 8 ++++++++ 11 files changed, 62 insertions(+), 6 deletions(-) diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 10e216c6e05e..f35bfe43bf7a 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -1132,6 +1132,15 @@ flowlabel_consistency - BOOLEAN FALSE: disabled Default: TRUE +auto_flowlabels - BOOLEAN + Automatically generate flow labels based based on a flow hash + of the packet. This allows intermediate devices, such as routers, + to idenfify packet flows for mechanisms like Equal Cost Multipath + Routing (see RFC 6438). + TRUE: enabled + FALSE: disabled + Default: false + anycast_src_echo_reply - BOOLEAN Controls the use of anycast addresses as source addresses for ICMPv6 echo reply diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 5dc68c3ebcbd..ff560537dd61 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -199,7 +199,8 @@ struct ipv6_pinfo { * 010: prefer public address * 100: prefer care-of address */ - dontfrag:1; + dontfrag:1, + autoflowlabel:1; __u8 min_hopcount; __u8 tclass; __be32 rcv_flowinfo; diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 2aa86e1135a1..4308f2ada8b3 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -699,6 +699,26 @@ static inline void ip6_set_txhash(struct sock *sk) sk->sk_txhash = flow_hash_from_keys(&keys); } +static inline __be32 ip6_make_flowlabel(struct net *net, struct sk_buff *skb, + __be32 flowlabel, bool autolabel) +{ + if (!flowlabel && (autolabel || net->ipv6.sysctl.auto_flowlabels)) { + __be32 hash; + + hash = skb_get_hash(skb); + + /* Since this is being sent on the wire obfuscate hash a bit + * to minimize possbility that any useful information to an + * attacker is leaked. Only lower 20 bits are relevant. + */ + hash ^= hash >> 12; + + flowlabel = hash & IPV6_FLOWLABEL_MASK; + } + + return flowlabel; +} + /* * Header manipulation */ diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 19d3446e59d2..eade27adecf3 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -28,6 +28,7 @@ struct netns_sysctl_ipv6 { int ip6_rt_mtu_expires; int ip6_rt_min_advmss; int flowlabel_consistency; + int auto_flowlabels; int icmpv6_time; int anycast_src_echo_reply; int fwmark_reflect; diff --git a/include/uapi/linux/in6.h b/include/uapi/linux/in6.h index 0d8e0f0342dc..22b7a69619d8 100644 --- a/include/uapi/linux/in6.h +++ b/include/uapi/linux/in6.h @@ -233,6 +233,7 @@ struct in6_flowlabel_req { #if 0 /* not yet */ #define IPV6_USE_MIN_MTU 63 #endif +#define IPV6_AUTOFLOWLABEL 64 /* * Netfilter (1) diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index a426cd7099bb..2daa3a133e49 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -765,6 +765,7 @@ static int __net_init inet6_net_init(struct net *net) net->ipv6.sysctl.bindv6only = 0; net->ipv6.sysctl.icmpv6_time = 1*HZ; net->ipv6.sysctl.flowlabel_consistency = 1; + net->ipv6.sysctl.auto_flowlabels = 0; atomic_set(&net->ipv6.rt_genid, 0); err = ipv6_init_mibs(net); diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 3873181ed856..365b2b6f3942 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -723,7 +723,8 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb, * Push down and install the IP header. */ ipv6h = ipv6_hdr(skb); - ip6_flow_hdr(ipv6h, INET_ECN_encapsulate(0, dsfield), fl6->flowlabel); + ip6_flow_hdr(ipv6h, INET_ECN_encapsulate(0, dsfield), + ip6_make_flowlabel(net, skb, fl6->flowlabel, false)); ipv6h->hop_limit = tunnel->parms.hop_limit; ipv6h->nexthdr = proto; ipv6h->saddr = fl6->saddr; @@ -1174,7 +1175,9 @@ static int ip6gre_header(struct sk_buff *skb, struct net_device *dev, struct ipv6hdr *ipv6h = (struct ipv6hdr *)skb_push(skb, t->hlen); __be16 *p = (__be16 *)(ipv6h+1); - ip6_flow_hdr(ipv6h, 0, t->fl.u.ip6.flowlabel); + ip6_flow_hdr(ipv6h, 0, + ip6_make_flowlabel(dev_net(dev), skb, + t->fl.u.ip6.flowlabel, false)); ipv6h->hop_limit = t->parms.hop_limit; ipv6h->nexthdr = NEXTHDR_GRE; ipv6h->saddr = t->parms.laddr; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index cb9df0eb4023..fa83bdd4c3dd 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -205,7 +205,8 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, if (hlimit < 0) hlimit = ip6_dst_hoplimit(dst); - ip6_flow_hdr(hdr, tclass, fl6->flowlabel); + ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel, + np->autoflowlabel)); hdr->payload_len = htons(seg_len); hdr->nexthdr = proto; @@ -1569,7 +1570,9 @@ int ip6_push_pending_frames(struct sock *sk) skb_reset_network_header(skb); hdr = ipv6_hdr(skb); - ip6_flow_hdr(hdr, np->cork.tclass, fl6->flowlabel); + ip6_flow_hdr(hdr, np->cork.tclass, + ip6_make_flowlabel(net, skb, fl6->flowlabel, + np->autoflowlabel)); hdr->hop_limit = np->cork.hop_limit; hdr->nexthdr = proto; hdr->saddr = fl6->saddr; diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index afa082458360..51a1eb185ea7 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1046,7 +1046,8 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, skb_push(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); ipv6h = ipv6_hdr(skb); - ip6_flow_hdr(ipv6h, INET_ECN_encapsulate(0, dsfield), fl6->flowlabel); + ip6_flow_hdr(ipv6h, INET_ECN_encapsulate(0, dsfield), + ip6_make_flowlabel(net, skb, fl6->flowlabel, false)); ipv6h->hop_limit = t->parms.hop_limit; ipv6h->nexthdr = proto; ipv6h->saddr = fl6->saddr; diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index cc34f65179e4..b50b9e54cf53 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -834,6 +834,10 @@ pref_skip_coa: np->dontfrag = valbool; retv = 0; break; + case IPV6_AUTOFLOWLABEL: + np->autoflowlabel = valbool; + retv = 0; + break; } release_sock(sk); @@ -1273,6 +1277,10 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, val = np->dontfrag; break; + case IPV6_AUTOFLOWLABEL: + val = np->autoflowlabel; + break; + default: return -ENOPROTOOPT; } diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index 058f3eca2e53..5bf7b61f8ae8 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -38,6 +38,13 @@ static struct ctl_table ipv6_table_template[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "auto_flowlabels", + .data = &init_net.ipv6.sysctl.auto_flowlabels, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, { .procname = "fwmark_reflect", .data = &init_net.ipv6.sysctl.fwmark_reflect, @@ -74,6 +81,7 @@ static int __net_init ipv6_sysctl_net_init(struct net *net) ipv6_table[0].data = &net->ipv6.sysctl.bindv6only; ipv6_table[1].data = &net->ipv6.sysctl.anycast_src_echo_reply; ipv6_table[2].data = &net->ipv6.sysctl.flowlabel_consistency; + ipv6_table[3].data = &net->ipv6.sysctl.auto_flowlabels; ipv6_route_table = ipv6_route_sysctl_init(net); if (!ipv6_route_table) From a3b18ddb9cc1056eea24e3edc1828cfb3fd0726f Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 1 Jul 2014 21:33:17 -0700 Subject: [PATCH 8/8] net: Only do flow_dissector hash computation once per packet Add sw_hash flag to skbuff to indicate that skb->hash was computed from flow_dissector. This flag is checked in skb_get_hash to avoid repeatedly trying to compute the hash (ie. in the case that no L4 hash can be computed). Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/skbuff.h | 9 +++++++-- net/core/flow_dissector.c | 2 ++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index b297af70ac30..890fb3307dd6 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -455,6 +455,7 @@ static inline u32 skb_mstamp_us_delta(const struct skb_mstamp *t1, * @ooo_okay: allow the mapping of a socket to a queue to be changed * @l4_hash: indicate hash is a canonical 4-tuple hash over transport * ports. + * @sw_hash: indicates hash was computed in software stack * @wifi_acked_valid: wifi_acked was set * @wifi_acked: whether frame was acked on wifi or not * @no_fcs: Request NIC to treat last 4 bytes as Ethernet FCS @@ -562,6 +563,7 @@ struct sk_buff { __u8 pfmemalloc:1; __u8 ooo_okay:1; __u8 l4_hash:1; + __u8 sw_hash:1; __u8 wifi_acked_valid:1; __u8 wifi_acked:1; __u8 no_fcs:1; @@ -575,7 +577,7 @@ struct sk_buff { __u8 encap_hdr_csum:1; __u8 csum_valid:1; __u8 csum_complete_sw:1; - /* 3/5 bit hole (depending on ndisc_nodetype presence) */ + /* 2/4 bit hole (depending on ndisc_nodetype presence) */ kmemcheck_bitfield_end(flags2); #if defined CONFIG_NET_DMA || defined CONFIG_NET_RX_BUSY_POLL @@ -830,13 +832,14 @@ static inline void skb_set_hash(struct sk_buff *skb, __u32 hash, enum pkt_hash_types type) { skb->l4_hash = (type == PKT_HASH_TYPE_L4); + skb->sw_hash = 0; skb->hash = hash; } void __skb_get_hash(struct sk_buff *skb); static inline __u32 skb_get_hash(struct sk_buff *skb) { - if (!skb->l4_hash) + if (!skb->l4_hash && !skb->sw_hash) __skb_get_hash(skb); return skb->hash; @@ -850,6 +853,7 @@ static inline __u32 skb_get_hash_raw(const struct sk_buff *skb) static inline void skb_clear_hash(struct sk_buff *skb) { skb->hash = 0; + skb->sw_hash = 0; skb->l4_hash = 0; } @@ -862,6 +866,7 @@ static inline void skb_clear_hash_if_not_l4(struct sk_buff *skb) static inline void skb_copy_hash(struct sk_buff *to, const struct sk_buff *from) { to->hash = from->hash; + to->sw_hash = from->sw_hash; to->l4_hash = from->l4_hash; }; diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index c5f3912dad4c..5f362c1d0332 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -256,6 +256,8 @@ void __skb_get_hash(struct sk_buff *skb) if (keys.ports) skb->l4_hash = 1; + skb->sw_hash = 1; + skb->hash = __flow_hash_from_keys(&keys); } EXPORT_SYMBOL(__skb_get_hash);