From 87d5c18ce1f41a2410d4fb26d5d68c55867450f5 Mon Sep 17 00:00:00 2001 From: Pablo Neira Date: Wed, 13 May 2015 18:19:34 +0200 Subject: [PATCH 1/5] netfilter: cleanup struct nf_hook_ops indentation Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- include/linux/netfilter.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 63560d0a8dfe..83be4a3cec98 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -79,16 +79,16 @@ typedef unsigned int nf_hookfn(const struct nf_hook_ops *ops, const struct nf_hook_state *state); struct nf_hook_ops { - struct list_head list; + struct list_head list; /* User fills in from here down. */ - nf_hookfn *hook; - struct module *owner; - void *priv; - u_int8_t pf; - unsigned int hooknum; + nf_hookfn *hook; + struct module *owner; + void *priv; + u_int8_t pf; + unsigned int hooknum; /* Hooks are ordered in ascending priority. */ - int priority; + int priority; }; struct nf_sockopt_ops { From f7191483461ce2ae579b6f7227fa7ce49e006656 Mon Sep 17 00:00:00 2001 From: Pablo Neira Date: Wed, 13 May 2015 18:19:35 +0200 Subject: [PATCH 2/5] netfilter: add hook list to nf_hook_state Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- include/linux/netfilter.h | 7 +++++-- net/netfilter/core.c | 6 ++---- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 83be4a3cec98..388ed1952242 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -54,10 +54,12 @@ struct nf_hook_state { struct net_device *in; struct net_device *out; struct sock *sk; + struct list_head *hook_list; int (*okfn)(struct sock *, struct sk_buff *); }; static inline void nf_hook_state_init(struct nf_hook_state *p, + struct list_head *hook_list, unsigned int hook, int thresh, u_int8_t pf, struct net_device *indev, @@ -71,6 +73,7 @@ static inline void nf_hook_state_init(struct nf_hook_state *p, p->in = indev; p->out = outdev; p->sk = sk; + p->hook_list = hook_list; p->okfn = okfn; } @@ -166,8 +169,8 @@ static inline int nf_hook_thresh(u_int8_t pf, unsigned int hook, if (nf_hooks_active(pf, hook)) { struct nf_hook_state state; - nf_hook_state_init(&state, hook, thresh, pf, - indev, outdev, sk, okfn); + nf_hook_state_init(&state, &nf_hooks[pf][hook], hook, thresh, + pf, indev, outdev, sk, okfn); return nf_hook_slow(skb, &state); } return 1; diff --git a/net/netfilter/core.c b/net/netfilter/core.c index e6163017c42d..e418cfd603c0 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -166,11 +166,9 @@ int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state) /* We may already have this, but read-locks nest anyway */ rcu_read_lock(); - elem = list_entry_rcu(&nf_hooks[state->pf][state->hook], - struct nf_hook_ops, list); + elem = list_entry_rcu(state->hook_list, struct nf_hook_ops, list); next_hook: - verdict = nf_iterate(&nf_hooks[state->pf][state->hook], skb, state, - &elem); + verdict = nf_iterate(state->hook_list, skb, state, &elem); if (verdict == NF_ACCEPT || verdict == NF_STOP) { ret = 1; } else if ((verdict & NF_VERDICT_MASK) == NF_DROP) { From b8d0aad0c77f488d1d51a02d871a5cbc2d8032b9 Mon Sep 17 00:00:00 2001 From: Pablo Neira Date: Wed, 13 May 2015 18:19:36 +0200 Subject: [PATCH 3/5] netfilter: add nf_hook_list_active() In preparation to have netfilter ingress per-device hook list. Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- include/linux/netfilter.h | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 388ed1952242..49d00638d1fa 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -134,26 +134,33 @@ extern struct list_head nf_hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; #ifdef HAVE_JUMP_LABEL extern struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; -static inline bool nf_hooks_active(u_int8_t pf, unsigned int hook) +static inline bool nf_hook_list_active(struct list_head *nf_hook_list, + u_int8_t pf, unsigned int hook) { if (__builtin_constant_p(pf) && __builtin_constant_p(hook)) return static_key_false(&nf_hooks_needed[pf][hook]); - return !list_empty(&nf_hooks[pf][hook]); + return !list_empty(nf_hook_list); } #else -static inline bool nf_hooks_active(u_int8_t pf, unsigned int hook) +static inline bool nf_hook_list_active(struct list_head *nf_hook_list, + u_int8_t pf, unsigned int hook) { - return !list_empty(&nf_hooks[pf][hook]); + return !list_empty(nf_hook_list); } #endif +static inline bool nf_hooks_active(u_int8_t pf, unsigned int hook) +{ + return nf_hook_list_active(&nf_hooks[pf][hook], pf, hook); +} + int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state); /** * nf_hook_thresh - call a netfilter hook - * + * * Returns 1 if the hook has allowed the packet to pass. The function * okfn must be invoked by the caller in this case. Any other return * value indicates the packet has been consumed by the hook. From 1cf51900f8545b358b5deaacfda348d990f671db Mon Sep 17 00:00:00 2001 From: Pablo Neira Date: Wed, 13 May 2015 18:19:37 +0200 Subject: [PATCH 4/5] net: add CONFIG_NET_INGRESS to enable ingress filtering This new config switch enables the ingress filtering infrastructure that is controlled through the ingress_needed static key. This prepares the introduction of the Netfilter ingress hook that resides under this unique static key. Note that CONFIG_SCH_INGRESS automatically selects this, that should be no problem since this also depends on CONFIG_NET_CLS_ACT. Signed-off-by: Pablo Neira Ayuso Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/rtnetlink.h | 2 +- net/Kconfig | 3 +++ net/core/dev.c | 7 ++++--- net/sched/Kconfig | 1 + 4 files changed, 9 insertions(+), 4 deletions(-) diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index bd29ab4b0941..a2324fb45cf4 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -79,7 +79,7 @@ static inline struct netdev_queue *dev_ingress_queue(struct net_device *dev) struct netdev_queue *dev_ingress_queue_create(struct net_device *dev); -#ifdef CONFIG_NET_CLS_ACT +#ifdef CONFIG_NET_INGRESS void net_inc_ingress_queue(void); void net_dec_ingress_queue(void); #endif diff --git a/net/Kconfig b/net/Kconfig index 44dd5786ee91..57a7c5af3175 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -45,6 +45,9 @@ config COMPAT_NETLINK_MESSAGES Newly written code should NEVER need this option but do compat-independent messages instead! +config NET_INGRESS + bool + menu "Networking options" source "net/packet/Kconfig" diff --git a/net/core/dev.c b/net/core/dev.c index af549062ae8e..a5ef90016ce7 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1630,7 +1630,7 @@ int call_netdevice_notifiers(unsigned long val, struct net_device *dev) } EXPORT_SYMBOL(call_netdevice_notifiers); -#ifdef CONFIG_NET_CLS_ACT +#ifdef CONFIG_NET_INGRESS static struct static_key ingress_needed __read_mostly; void net_inc_ingress_queue(void) @@ -3798,13 +3798,14 @@ another_round: } skip_taps: -#ifdef CONFIG_NET_CLS_ACT +#ifdef CONFIG_NET_INGRESS if (static_key_false(&ingress_needed)) { skb = handle_ing(skb, &pt_prev, &ret, orig_dev); if (!skb) goto unlock; } - +#endif +#ifdef CONFIG_NET_CLS_ACT skb->tc_verd = 0; ncls: #endif diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 5fd1c2f487d2..daa33432b716 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -312,6 +312,7 @@ config NET_SCH_PIE config NET_SCH_INGRESS tristate "Ingress Qdisc" depends on NET_CLS_ACT + select NET_INGRESS ---help--- Say Y here if you want to use classifiers for incoming packets. If unsure, say Y. From e687ad60af09010936bbd0b2a3b5d90a8ee8353c Mon Sep 17 00:00:00 2001 From: Pablo Neira Date: Wed, 13 May 2015 18:19:38 +0200 Subject: [PATCH 5/5] netfilter: add netfilter ingress hook after handle_ing() under unique static key This patch adds the Netfilter ingress hook just after the existing tc ingress hook, that seems to be the consensus solution for this. Note that the Netfilter hook resides under the global static key that enables ingress filtering. Nonetheless, Netfilter still also has its own static key for minimal impact on the existing handle_ing(). * Without this patch: Result: OK: 6216490(c6216338+d152) usec, 100000000 (60byte,0frags) 16086246pps 7721Mb/sec (7721398080bps) errors: 100000000 42.46% kpktgend_0 [kernel.kallsyms] [k] __netif_receive_skb_core 25.92% kpktgend_0 [kernel.kallsyms] [k] kfree_skb 7.81% kpktgend_0 [pktgen] [k] pktgen_thread_worker 5.62% kpktgend_0 [kernel.kallsyms] [k] ip_rcv 2.70% kpktgend_0 [kernel.kallsyms] [k] netif_receive_skb_internal 2.34% kpktgend_0 [kernel.kallsyms] [k] netif_receive_skb_sk 1.44% kpktgend_0 [kernel.kallsyms] [k] __build_skb * With this patch: Result: OK: 6214833(c6214731+d101) usec, 100000000 (60byte,0frags) 16090536pps 7723Mb/sec (7723457280bps) errors: 100000000 41.23% kpktgend_0 [kernel.kallsyms] [k] __netif_receive_skb_core 26.57% kpktgend_0 [kernel.kallsyms] [k] kfree_skb 7.72% kpktgend_0 [pktgen] [k] pktgen_thread_worker 5.55% kpktgend_0 [kernel.kallsyms] [k] ip_rcv 2.78% kpktgend_0 [kernel.kallsyms] [k] netif_receive_skb_internal 2.06% kpktgend_0 [kernel.kallsyms] [k] netif_receive_skb_sk 1.43% kpktgend_0 [kernel.kallsyms] [k] __build_skb * Without this patch + tc ingress: tc filter add dev eth4 parent ffff: protocol ip prio 1 \ u32 match ip dst 4.3.2.1/32 Result: OK: 9269001(c9268821+d179) usec, 100000000 (60byte,0frags) 10788648pps 5178Mb/sec (5178551040bps) errors: 100000000 40.99% kpktgend_0 [kernel.kallsyms] [k] __netif_receive_skb_core 17.50% kpktgend_0 [kernel.kallsyms] [k] kfree_skb 11.77% kpktgend_0 [cls_u32] [k] u32_classify 5.62% kpktgend_0 [kernel.kallsyms] [k] tc_classify_compat 5.18% kpktgend_0 [pktgen] [k] pktgen_thread_worker 3.23% kpktgend_0 [kernel.kallsyms] [k] tc_classify 2.97% kpktgend_0 [kernel.kallsyms] [k] ip_rcv 1.83% kpktgend_0 [kernel.kallsyms] [k] netif_receive_skb_internal 1.50% kpktgend_0 [kernel.kallsyms] [k] netif_receive_skb_sk 0.99% kpktgend_0 [kernel.kallsyms] [k] __build_skb * With this patch + tc ingress: tc filter add dev eth4 parent ffff: protocol ip prio 1 \ u32 match ip dst 4.3.2.1/32 Result: OK: 9308218(c9308091+d126) usec, 100000000 (60byte,0frags) 10743194pps 5156Mb/sec (5156733120bps) errors: 100000000 42.01% kpktgend_0 [kernel.kallsyms] [k] __netif_receive_skb_core 17.78% kpktgend_0 [kernel.kallsyms] [k] kfree_skb 11.70% kpktgend_0 [cls_u32] [k] u32_classify 5.46% kpktgend_0 [kernel.kallsyms] [k] tc_classify_compat 5.16% kpktgend_0 [pktgen] [k] pktgen_thread_worker 2.98% kpktgend_0 [kernel.kallsyms] [k] ip_rcv 2.84% kpktgend_0 [kernel.kallsyms] [k] tc_classify 1.96% kpktgend_0 [kernel.kallsyms] [k] netif_receive_skb_internal 1.57% kpktgend_0 [kernel.kallsyms] [k] netif_receive_skb_sk Note that the results are very similar before and after. I can see gcc gets the code under the ingress static key out of the hot path. Then, on that cold branch, it generates the code to accomodate the netfilter ingress static key. My explanation for this is that this reduces the pressure on the instruction cache for non-users as the new code is out of the hot path, and it comes with minimal impact for tc ingress users. Using gcc version 4.8.4 on: Architecture: x86_64 CPU op-mode(s): 32-bit, 64-bit Byte Order: Little Endian CPU(s): 8 [...] L1d cache: 16K L1i cache: 64K L2 cache: 2048K L3 cache: 8192K Signed-off-by: Pablo Neira Ayuso Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 +++ include/linux/netfilter.h | 1 + include/linux/netfilter_ingress.h | 41 +++++++++++++++++++++++++++++++ include/uapi/linux/netfilter.h | 6 +++++ net/core/dev.c | 36 +++++++++++++++++++++++++++ net/netfilter/Kconfig | 7 ++++++ net/netfilter/core.c | 31 ++++++++++++++++++++++- 7 files changed, 124 insertions(+), 1 deletion(-) create mode 100644 include/linux/netfilter_ingress.h diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d3ed01c18247..51f8d2f5dc3f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1656,6 +1656,9 @@ struct net_device { struct tcf_proto __rcu *ingress_cl_list; #endif struct netdev_queue __rcu *ingress_queue; +#ifdef CONFIG_NETFILTER_INGRESS + struct list_head nf_hooks_ingress; +#endif unsigned char broadcast[MAX_ADDR_LEN]; #ifdef CONFIG_RFS_ACCEL diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 49d00638d1fa..f5ff5d156da8 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -86,6 +86,7 @@ struct nf_hook_ops { /* User fills in from here down. */ nf_hookfn *hook; + struct net_device *dev; struct module *owner; void *priv; u_int8_t pf; diff --git a/include/linux/netfilter_ingress.h b/include/linux/netfilter_ingress.h new file mode 100644 index 000000000000..cb0727fe2b3d --- /dev/null +++ b/include/linux/netfilter_ingress.h @@ -0,0 +1,41 @@ +#ifndef _NETFILTER_INGRESS_H_ +#define _NETFILTER_INGRESS_H_ + +#include +#include + +#ifdef CONFIG_NETFILTER_INGRESS +static inline int nf_hook_ingress_active(struct sk_buff *skb) +{ + return nf_hook_list_active(&skb->dev->nf_hooks_ingress, + NFPROTO_NETDEV, NF_NETDEV_INGRESS); +} + +static inline int nf_hook_ingress(struct sk_buff *skb) +{ + struct nf_hook_state state; + + nf_hook_state_init(&state, &skb->dev->nf_hooks_ingress, + NF_NETDEV_INGRESS, INT_MIN, NFPROTO_NETDEV, NULL, + skb->dev, NULL, NULL); + return nf_hook_slow(skb, &state); +} + +static inline void nf_hook_ingress_init(struct net_device *dev) +{ + INIT_LIST_HEAD(&dev->nf_hooks_ingress); +} +#else /* CONFIG_NETFILTER_INGRESS */ +static inline int nf_hook_ingress_active(struct sk_buff *skb) +{ + return 0; +} + +static inline int nf_hook_ingress(struct sk_buff *skb) +{ + return 0; +} + +static inline void nf_hook_ingress_init(struct net_device *dev) {} +#endif /* CONFIG_NETFILTER_INGRESS */ +#endif /* _NETFILTER_INGRESS_H_ */ diff --git a/include/uapi/linux/netfilter.h b/include/uapi/linux/netfilter.h index ef1b1f88ca18..177027cce6b3 100644 --- a/include/uapi/linux/netfilter.h +++ b/include/uapi/linux/netfilter.h @@ -51,11 +51,17 @@ enum nf_inet_hooks { NF_INET_NUMHOOKS }; +enum nf_dev_hooks { + NF_NETDEV_INGRESS, + NF_NETDEV_NUMHOOKS +}; + enum { NFPROTO_UNSPEC = 0, NFPROTO_INET = 1, NFPROTO_IPV4 = 2, NFPROTO_ARP = 3, + NFPROTO_NETDEV = 5, NFPROTO_BRIDGE = 7, NFPROTO_IPV6 = 10, NFPROTO_DECNET = 12, diff --git a/net/core/dev.c b/net/core/dev.c index a5ef90016ce7..29f0d6e6542c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -135,6 +135,7 @@ #include #include #include +#include #include "net-sysfs.h" @@ -3666,6 +3667,13 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb, return skb; } +#else +static inline struct sk_buff *handle_ing(struct sk_buff *skb, + struct packet_type **pt_prev, + int *ret, struct net_device *orig_dev) +{ + return skb; +} #endif /** @@ -3739,6 +3747,28 @@ static bool skb_pfmemalloc_protocol(struct sk_buff *skb) } } +#ifdef CONFIG_NETFILTER_INGRESS +static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev, + int *ret, struct net_device *orig_dev) +{ + if (nf_hook_ingress_active(skb)) { + if (*pt_prev) { + *ret = deliver_skb(skb, *pt_prev, orig_dev); + *pt_prev = NULL; + } + + return nf_hook_ingress(skb); + } + return 0; +} +#else +static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev, + int *ret, struct net_device *orig_dev) +{ + return 0; +} +#endif + static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc) { struct packet_type *ptype, *pt_prev; @@ -3803,6 +3833,9 @@ skip_taps: skb = handle_ing(skb, &pt_prev, &ret, orig_dev); if (!skb) goto unlock; + + if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0) + goto unlock; } #endif #ifdef CONFIG_NET_CLS_ACT @@ -6968,6 +7001,9 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev->group = INIT_NETDEV_GROUP; if (!dev->ethtool_ops) dev->ethtool_ops = &default_ethtool_ops; + + nf_hook_ingress_init(dev); + return dev; free_all: diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index f70e34a68f70..db1c674397ad 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -1,6 +1,13 @@ menu "Core Netfilter Configuration" depends on NET && INET && NETFILTER +config NETFILTER_INGRESS + bool "Netfilter ingress support" + select NET_INGRESS + help + This allows you to classify packets from ingress using the Netfilter + infrastructure. + config NETFILTER_NETLINK tristate diff --git a/net/netfilter/core.c b/net/netfilter/core.c index e418cfd603c0..653e32eac08c 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -64,10 +64,27 @@ static DEFINE_MUTEX(nf_hook_mutex); int nf_register_hook(struct nf_hook_ops *reg) { + struct list_head *nf_hook_list; struct nf_hook_ops *elem; mutex_lock(&nf_hook_mutex); - list_for_each_entry(elem, &nf_hooks[reg->pf][reg->hooknum], list) { + switch (reg->pf) { + case NFPROTO_NETDEV: +#ifdef CONFIG_NETFILTER_INGRESS + if (reg->hooknum == NF_NETDEV_INGRESS) { + BUG_ON(reg->dev == NULL); + nf_hook_list = ®->dev->nf_hooks_ingress; + net_inc_ingress_queue(); + break; + } +#endif + /* Fall through. */ + default: + nf_hook_list = &nf_hooks[reg->pf][reg->hooknum]; + break; + } + + list_for_each_entry(elem, nf_hook_list, list) { if (reg->priority < elem->priority) break; } @@ -85,6 +102,18 @@ void nf_unregister_hook(struct nf_hook_ops *reg) mutex_lock(&nf_hook_mutex); list_del_rcu(®->list); mutex_unlock(&nf_hook_mutex); + switch (reg->pf) { + case NFPROTO_NETDEV: +#ifdef CONFIG_NETFILTER_INGRESS + if (reg->hooknum == NF_NETDEV_INGRESS) { + net_dec_ingress_queue(); + break; + } + break; +#endif + default: + break; + } #ifdef HAVE_JUMP_LABEL static_key_slow_dec(&nf_hooks_needed[reg->pf][reg->hooknum]); #endif