This is the 4.4.174 stable release

-----BEGIN PGP SIGNATURE-----
 
 iQIzBAABCAAdFiEEZH8oZUiU471FcZm+ONu9yGCSaT4FAlxdWR4ACgkQONu9yGCS
 aT5Y0Q//YFvFKlYGFDE3yg3Q81b4O49bkO5bpgGqGs6cn7vfY4WxlKXDa4olHKdk
 DLsehAjLRV0MlJfV9kfPT03nCecyjzs8m4AH7OwqyujW2ZCM+YCOR2gx1fCK5KNb
 twO7mTIBKv38T2ilGLYqXBf1pha9DA2RMMKWCuMRIOagC/OYsfq2RkSnKH2p0gFP
 plsSpEYJ5rtVbk2Dxnf5y+simJmtzyiHoIBZJZq7tCRVT9XhJqMbxHeGFUwTFj7W
 AdWAx/zWM/OBe+NvSmqIdaiYxaNb91RjfeMZQrafRS/KcgsD80nNmT6Kk07NXfnT
 3eUHq2i+S8bokadcfcjA6UhT48kqh79vyllm71DeaNkuvaapPxkYYKESeNpeOcop
 06MyENBwUYrTCkuc3raC/0FLJ7Csxoe51V6M9VdQjtsvnX35DcX+9YiwGn32N5h/
 q9qdXJH6TaYhSGQozcAVhHWl5U1Nl76vw0LQXagvvUqJ4lCZVlYCptwzr7e2A6/Y
 WQQeFwUSp4Niw0m2HXmBP9unIzt5MhjknKrb3z962S48Ie4hM8LC/g/jwhFOrj6U
 XxuatqiUbjt8yyteSd1gVf82vjkDqR1YLk6qXFwvEJpPtZ7DmOQ8CgE2VLS+rbXP
 xFz5bZXuvW7kgqdm41DjHWqq8rT/81pooeGUPSLhY4VMUQ58poE=
 =hOLk
 -----END PGP SIGNATURE-----

Merge 4.4.174 into android-4.4

Changes in 4.4.174
	inet: frags: change inet_frags_init_net() return value
	inet: frags: add a pointer to struct netns_frags
	inet: frags: refactor ipfrag_init()
	inet: frags: refactor ipv6_frag_init()
	inet: frags: refactor lowpan_net_frag_init()
	rhashtable: add rhashtable_lookup_get_insert_key()
	rhashtable: Add rhashtable_lookup()
	rhashtable: add schedule points
	inet: frags: use rhashtables for reassembly units
	net: ieee802154: 6lowpan: fix frag reassembly
	ipfrag: really prevent allocation on netns exit
	inet: frags: remove some helpers
	inet: frags: get rid of inet_frag_evicting()
	inet: frags: remove inet_frag_maybe_warn_overflow()
	inet: frags: break the 2GB limit for frags storage
	inet: frags: do not clone skb in ip_expire()
	ipv6: frags: rewrite ip6_expire_frag_queue()
	rhashtable: reorganize struct rhashtable layout
	inet: frags: reorganize struct netns_frags
	inet: frags: get rid of ipfrag_skb_cb/FRAG_CB
	inet: frags: fix ip6frag_low_thresh boundary
	ip: discard IPv4 datagrams with overlapping segments.
	net: modify skb_rbtree_purge to return the truesize of all purged skbs.
	ipv6: defrag: drop non-last frags smaller than min mtu
	net: pskb_trim_rcsum() and CHECKSUM_COMPLETE are friends
	ip: use rb trees for IP frag queue.
	ip: add helpers to process in-order fragments faster.
	ip: process in-order fragments efficiently
	ip: frags: fix crash in ip_do_fragment()
	ipv4: frags: precedence bug in ip_expire()
	inet: frags: better deal with smp races
	net: fix pskb_trim_rcsum_slow() with odd trim offset
	net: ipv4: do not handle duplicate fragments as overlapping
	rcu: Force boolean subscript for expedited stall warnings
	Linux 4.4.174

Change-Id: I47eace4f47ffe0bf16b29615d09ed903c40a272b
Signed-off-by: Greg Kroah-Hartman <gregkh@google.com>
commit 62872f952d
Greg Kroah-Hartman, 2019-02-08 11:47:29 +01:00
19 files changed, 874 insertions(+), 968 deletions(-)


@ -112,14 +112,11 @@ min_adv_mss - INTEGER
IP Fragmentation: IP Fragmentation:
ipfrag_high_thresh - INTEGER ipfrag_high_thresh - LONG INTEGER
Maximum memory used to reassemble IP fragments. When Maximum memory used to reassemble IP fragments.
ipfrag_high_thresh bytes of memory is allocated for this purpose,
the fragment handler will toss packets until ipfrag_low_thresh
is reached. This also serves as a maximum limit to namespaces
different from the initial one.
ipfrag_low_thresh - INTEGER ipfrag_low_thresh - LONG INTEGER
(Obsolete since linux-4.17)
Maximum memory used to reassemble IP fragments before the kernel Maximum memory used to reassemble IP fragments before the kernel
begins to remove incomplete fragment queues to free up resources. begins to remove incomplete fragment queues to free up resources.
The kernel still accepts new fragments for defragmentation. The kernel still accepts new fragments for defragmentation.
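
For reference, a minimal sketch of what a matching per-netns sysctl entry looks like once the thresholds become longs; the table and helper names follow the ipv4/6lowpan entries changed later in this diff and are meant as an illustration, not a quote of the patch:

static struct ctl_table ip4_frags_ns_ctl_table[] = {
	{
		.procname	= "ipfrag_high_thresh",
		.data		= &init_net.ipv4.frags.high_thresh,
		.maxlen		= sizeof(unsigned long),   /* was sizeof(int) */
		.mode		= 0644,
		.proc_handler	= proc_doulongvec_minmax,  /* was proc_dointvec_minmax */
		.extra1		= &init_net.ipv4.frags.low_thresh
	},
	{ }
};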


@ -1,6 +1,6 @@
VERSION = 4 VERSION = 4
PATCHLEVEL = 4 PATCHLEVEL = 4
SUBLEVEL = 173 SUBLEVEL = 174
EXTRAVERSION = EXTRAVERSION =
NAME = Blurry Fish Butt NAME = Blurry Fish Butt


@ -133,23 +133,23 @@ struct rhashtable_params {
/** /**
* struct rhashtable - Hash table handle * struct rhashtable - Hash table handle
* @tbl: Bucket table * @tbl: Bucket table
* @nelems: Number of elements in table
* @key_len: Key length for hashfn * @key_len: Key length for hashfn
* @elasticity: Maximum chain length before rehash * @elasticity: Maximum chain length before rehash
* @p: Configuration parameters * @p: Configuration parameters
* @run_work: Deferred worker to expand/shrink asynchronously * @run_work: Deferred worker to expand/shrink asynchronously
* @mutex: Mutex to protect current/future table swapping * @mutex: Mutex to protect current/future table swapping
* @lock: Spin lock to protect walker list * @lock: Spin lock to protect walker list
* @nelems: Number of elements in table
*/ */
struct rhashtable { struct rhashtable {
struct bucket_table __rcu *tbl; struct bucket_table __rcu *tbl;
atomic_t nelems;
unsigned int key_len; unsigned int key_len;
unsigned int elasticity; unsigned int elasticity;
struct rhashtable_params p; struct rhashtable_params p;
struct work_struct run_work; struct work_struct run_work;
struct mutex mutex; struct mutex mutex;
spinlock_t lock; spinlock_t lock;
atomic_t nelems;
}; };
/** /**
@ -343,7 +343,8 @@ int rhashtable_init(struct rhashtable *ht,
struct bucket_table *rhashtable_insert_slow(struct rhashtable *ht, struct bucket_table *rhashtable_insert_slow(struct rhashtable *ht,
const void *key, const void *key,
struct rhash_head *obj, struct rhash_head *obj,
struct bucket_table *old_tbl); struct bucket_table *old_tbl,
void **data);
int rhashtable_insert_rehash(struct rhashtable *ht, struct bucket_table *tbl); int rhashtable_insert_rehash(struct rhashtable *ht, struct bucket_table *tbl);
int rhashtable_walk_init(struct rhashtable *ht, struct rhashtable_iter *iter); int rhashtable_walk_init(struct rhashtable *ht, struct rhashtable_iter *iter);
@ -514,18 +515,8 @@ static inline int rhashtable_compare(struct rhashtable_compare_arg *arg,
return memcmp(ptr + ht->p.key_offset, arg->key, ht->p.key_len); return memcmp(ptr + ht->p.key_offset, arg->key, ht->p.key_len);
} }
/** /* Internal function, do not use. */
* rhashtable_lookup_fast - search hash table, inlined version static inline struct rhash_head *__rhashtable_lookup(
* @ht: hash table
* @key: the pointer to the key
* @params: hash table parameters
*
* Computes the hash value for the key and traverses the bucket chain looking
* for a entry with an identical key. The first matching entry is returned.
*
* Returns the first entry on which the compare function returned true.
*/
static inline void *rhashtable_lookup_fast(
struct rhashtable *ht, const void *key, struct rhashtable *ht, const void *key,
const struct rhashtable_params params) const struct rhashtable_params params)
{ {
@ -537,8 +528,6 @@ static inline void *rhashtable_lookup_fast(
struct rhash_head *he; struct rhash_head *he;
unsigned int hash; unsigned int hash;
rcu_read_lock();
tbl = rht_dereference_rcu(ht->tbl, ht); tbl = rht_dereference_rcu(ht->tbl, ht);
restart: restart:
hash = rht_key_hashfn(ht, tbl, key, params); hash = rht_key_hashfn(ht, tbl, key, params);
@ -547,8 +536,7 @@ restart:
params.obj_cmpfn(&arg, rht_obj(ht, he)) : params.obj_cmpfn(&arg, rht_obj(ht, he)) :
rhashtable_compare(&arg, rht_obj(ht, he))) rhashtable_compare(&arg, rht_obj(ht, he)))
continue; continue;
rcu_read_unlock(); return he;
return rht_obj(ht, he);
} }
/* Ensure we see any new tables. */ /* Ensure we see any new tables. */
@ -557,13 +545,64 @@ restart:
tbl = rht_dereference_rcu(tbl->future_tbl, ht); tbl = rht_dereference_rcu(tbl->future_tbl, ht);
if (unlikely(tbl)) if (unlikely(tbl))
goto restart; goto restart;
rcu_read_unlock();
return NULL; return NULL;
} }
/* Internal function, please use rhashtable_insert_fast() instead */ /**
static inline int __rhashtable_insert_fast( * rhashtable_lookup - search hash table
* @ht: hash table
* @key: the pointer to the key
* @params: hash table parameters
*
* Computes the hash value for the key and traverses the bucket chain looking
* for a entry with an identical key. The first matching entry is returned.
*
* This must only be called under the RCU read lock.
*
* Returns the first entry on which the compare function returned true.
*/
static inline void *rhashtable_lookup(
struct rhashtable *ht, const void *key,
const struct rhashtable_params params)
{
struct rhash_head *he = __rhashtable_lookup(ht, key, params);
return he ? rht_obj(ht, he) : NULL;
}
/**
* rhashtable_lookup_fast - search hash table, without RCU read lock
* @ht: hash table
* @key: the pointer to the key
* @params: hash table parameters
*
* Computes the hash value for the key and traverses the bucket chain looking
* for a entry with an identical key. The first matching entry is returned.
*
* Only use this function when you have other mechanisms guaranteeing
* that the object won't go away after the RCU read lock is released.
*
* Returns the first entry on which the compare function returned true.
*/
static inline void *rhashtable_lookup_fast(
struct rhashtable *ht, const void *key,
const struct rhashtable_params params)
{
void *obj;
rcu_read_lock();
obj = rhashtable_lookup(ht, key, params);
rcu_read_unlock();
return obj;
}
/* Internal function, please use rhashtable_insert_fast() instead. This
* function returns the existing element already in hashes in there is a clash,
* otherwise it returns an error via ERR_PTR().
*/
static inline void *__rhashtable_insert_fast(
struct rhashtable *ht, const void *key, struct rhash_head *obj, struct rhashtable *ht, const void *key, struct rhash_head *obj,
const struct rhashtable_params params) const struct rhashtable_params params)
{ {
@ -576,6 +615,7 @@ static inline int __rhashtable_insert_fast(
spinlock_t *lock; spinlock_t *lock;
unsigned int elasticity; unsigned int elasticity;
unsigned int hash; unsigned int hash;
void *data = NULL;
int err; int err;
restart: restart:
@ -600,11 +640,14 @@ restart:
new_tbl = rht_dereference_rcu(tbl->future_tbl, ht); new_tbl = rht_dereference_rcu(tbl->future_tbl, ht);
if (unlikely(new_tbl)) { if (unlikely(new_tbl)) {
tbl = rhashtable_insert_slow(ht, key, obj, new_tbl); tbl = rhashtable_insert_slow(ht, key, obj, new_tbl, &data);
if (!IS_ERR_OR_NULL(tbl)) if (!IS_ERR_OR_NULL(tbl))
goto slow_path; goto slow_path;
err = PTR_ERR(tbl); err = PTR_ERR(tbl);
if (err == -EEXIST)
err = 0;
goto out; goto out;
} }
@ -618,25 +661,25 @@ slow_path:
err = rhashtable_insert_rehash(ht, tbl); err = rhashtable_insert_rehash(ht, tbl);
rcu_read_unlock(); rcu_read_unlock();
if (err) if (err)
return err; return ERR_PTR(err);
goto restart; goto restart;
} }
err = -EEXIST; err = 0;
elasticity = ht->elasticity; elasticity = ht->elasticity;
rht_for_each(head, tbl, hash) { rht_for_each(head, tbl, hash) {
if (key && if (key &&
unlikely(!(params.obj_cmpfn ? unlikely(!(params.obj_cmpfn ?
params.obj_cmpfn(&arg, rht_obj(ht, head)) : params.obj_cmpfn(&arg, rht_obj(ht, head)) :
rhashtable_compare(&arg, rht_obj(ht, head))))) rhashtable_compare(&arg, rht_obj(ht, head))))) {
data = rht_obj(ht, head);
goto out; goto out;
}
if (!--elasticity) if (!--elasticity)
goto slow_path; goto slow_path;
} }
err = 0;
head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash); head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash);
RCU_INIT_POINTER(obj->next, head); RCU_INIT_POINTER(obj->next, head);
@ -651,7 +694,7 @@ out:
spin_unlock_bh(lock); spin_unlock_bh(lock);
rcu_read_unlock(); rcu_read_unlock();
return err; return err ? ERR_PTR(err) : data;
} }
/** /**
@ -674,7 +717,13 @@ static inline int rhashtable_insert_fast(
struct rhashtable *ht, struct rhash_head *obj, struct rhashtable *ht, struct rhash_head *obj,
const struct rhashtable_params params) const struct rhashtable_params params)
{ {
return __rhashtable_insert_fast(ht, NULL, obj, params); void *ret;
ret = __rhashtable_insert_fast(ht, NULL, obj, params);
if (IS_ERR(ret))
return PTR_ERR(ret);
return ret == NULL ? 0 : -EEXIST;
} }
/** /**
@ -703,11 +752,15 @@ static inline int rhashtable_lookup_insert_fast(
const struct rhashtable_params params) const struct rhashtable_params params)
{ {
const char *key = rht_obj(ht, obj); const char *key = rht_obj(ht, obj);
void *ret;
BUG_ON(ht->p.obj_hashfn); BUG_ON(ht->p.obj_hashfn);
return __rhashtable_insert_fast(ht, key + ht->p.key_offset, obj, ret = __rhashtable_insert_fast(ht, key + ht->p.key_offset, obj, params);
params); if (IS_ERR(ret))
return PTR_ERR(ret);
return ret == NULL ? 0 : -EEXIST;
} }
/** /**
@ -735,6 +788,32 @@ static inline int rhashtable_lookup_insert_fast(
static inline int rhashtable_lookup_insert_key( static inline int rhashtable_lookup_insert_key(
struct rhashtable *ht, const void *key, struct rhash_head *obj, struct rhashtable *ht, const void *key, struct rhash_head *obj,
const struct rhashtable_params params) const struct rhashtable_params params)
{
void *ret;
BUG_ON(!ht->p.obj_hashfn || !key);
ret = __rhashtable_insert_fast(ht, key, obj, params);
if (IS_ERR(ret))
return PTR_ERR(ret);
return ret == NULL ? 0 : -EEXIST;
}
/**
* rhashtable_lookup_get_insert_key - lookup and insert object into hash table
* @ht: hash table
* @obj: pointer to hash head inside object
* @params: hash table parameters
* @data: pointer to element data already in hashes
*
* Just like rhashtable_lookup_insert_key(), but this function returns the
* object if it exists, NULL if it does not and the insertion was successful,
* and an ERR_PTR otherwise.
*/
static inline void *rhashtable_lookup_get_insert_key(
struct rhashtable *ht, const void *key, struct rhash_head *obj,
const struct rhashtable_params params)
{ {
BUG_ON(!ht->p.obj_hashfn || !key); BUG_ON(!ht->p.obj_hashfn || !key);
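
To make the new split concrete, a small usage sketch (struct, field and function names here are illustrative, not from this patch): rhashtable_lookup() assumes the caller already holds the RCU read lock, rhashtable_lookup_fast() takes and drops it internally, and rhashtable_lookup_get_insert_key() (used by inet_frag_create() later in this diff) returns the clashing object, NULL on a successful insert, or an ERR_PTR() on error.

#include <linux/rhashtable.h>

struct my_obj {
	struct rhash_head node;
	u32 key;
};

static const struct rhashtable_params my_params = {
	.head_offset	= offsetof(struct my_obj, node),
	.key_offset	= offsetof(struct my_obj, key),
	.key_len	= sizeof(u32),
	.automatic_shrinking = true,
};

/* Caller is already inside rcu_read_lock(). */
static struct my_obj *my_find_rcu(struct rhashtable *ht, u32 key)
{
	return rhashtable_lookup(ht, &key, my_params);
}

/* No RCU section held; only safe when something else keeps the object alive. */
static struct my_obj *my_find(struct rhashtable *ht, u32 key)
{
	return rhashtable_lookup_fast(ht, &key, my_params);
}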


@ -556,9 +556,14 @@ struct sk_buff {
struct skb_mstamp skb_mstamp; struct skb_mstamp skb_mstamp;
}; };
}; };
struct rb_node rbnode; /* used in netem & tcp stack */ struct rb_node rbnode; /* used in netem, ip4 defrag, and tcp stack */
}; };
struct sock *sk;
union {
struct sock *sk;
int ip_defrag_offset;
};
struct net_device *dev; struct net_device *dev;
/* /*
@ -2273,7 +2278,7 @@ static inline void __skb_queue_purge(struct sk_buff_head *list)
kfree_skb(skb); kfree_skb(skb);
} }
void skb_rbtree_purge(struct rb_root *root); unsigned int skb_rbtree_purge(struct rb_root *root);
void *netdev_alloc_frag(unsigned int fragsz); void *netdev_alloc_frag(unsigned int fragsz);
@ -2791,6 +2796,7 @@ static inline unsigned char *skb_push_rcsum(struct sk_buff *skb,
return skb->data; return skb->data;
} }
int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len);
/** /**
* pskb_trim_rcsum - trim received skb and update checksum * pskb_trim_rcsum - trim received skb and update checksum
* @skb: buffer to trim * @skb: buffer to trim
@ -2805,9 +2811,7 @@ static inline int pskb_trim_rcsum(struct sk_buff *skb, unsigned int len)
{ {
if (likely(len >= skb->len)) if (likely(len >= skb->len))
return 0; return 0;
if (skb->ip_summed == CHECKSUM_COMPLETE) return pskb_trim_rcsum_slow(skb, len);
skb->ip_summed = CHECKSUM_NONE;
return __pskb_trim(skb, len);
} }
#define rb_to_skb(rb) rb_entry_safe(rb, struct sk_buff, rbnode) #define rb_to_skb(rb) rb_entry_safe(rb, struct sk_buff, rbnode)
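
A short caller-side sketch of why the return type of skb_rbtree_purge() changed (names illustrative): the defrag code can now fold the purge directly into its per-netns memory accounting instead of walking the tree twice.

static void example_drop_queue(struct inet_frag_queue *q)
{
	/* Free every queued fragment and give the memory back in one pass. */
	unsigned int freed = skb_rbtree_purge(&q->rb_fragments);

	sub_frag_mem_limit(q->net, freed);
}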


@ -1,13 +1,19 @@
#ifndef __NET_FRAG_H__ #ifndef __NET_FRAG_H__
#define __NET_FRAG_H__ #define __NET_FRAG_H__
#include <linux/rhashtable.h>
struct netns_frags { struct netns_frags {
/* Keep atomic mem on separate cachelines in structs that include it */
atomic_t mem ____cacheline_aligned_in_smp;
/* sysctls */ /* sysctls */
long high_thresh;
long low_thresh;
int timeout; int timeout;
int high_thresh; struct inet_frags *f;
int low_thresh;
struct rhashtable rhashtable ____cacheline_aligned_in_smp;
/* Keep atomic mem on separate cachelines in structs that include it */
atomic_long_t mem ____cacheline_aligned_in_smp;
}; };
/** /**
@ -23,74 +29,68 @@ enum {
INET_FRAG_COMPLETE = BIT(2), INET_FRAG_COMPLETE = BIT(2),
}; };
struct frag_v4_compare_key {
__be32 saddr;
__be32 daddr;
u32 user;
u32 vif;
__be16 id;
u16 protocol;
};
struct frag_v6_compare_key {
struct in6_addr saddr;
struct in6_addr daddr;
u32 user;
__be32 id;
u32 iif;
};
/** /**
* struct inet_frag_queue - fragment queue * struct inet_frag_queue - fragment queue
* *
* @lock: spinlock protecting the queue * @node: rhash node
* @key: keys identifying this frag.
* @timer: queue expiration timer * @timer: queue expiration timer
* @list: hash bucket list * @lock: spinlock protecting this frag
* @refcnt: reference count of the queue * @refcnt: reference count of the queue
* @fragments: received fragments head * @fragments: received fragments head
* @rb_fragments: received fragments rb-tree root
* @fragments_tail: received fragments tail * @fragments_tail: received fragments tail
* @last_run_head: the head of the last "run". see ip_fragment.c
* @stamp: timestamp of the last received fragment * @stamp: timestamp of the last received fragment
* @len: total length of the original datagram * @len: total length of the original datagram
* @meat: length of received fragments so far * @meat: length of received fragments so far
* @flags: fragment queue flags * @flags: fragment queue flags
* @max_size: maximum received fragment size * @max_size: maximum received fragment size
* @net: namespace that this frag belongs to * @net: namespace that this frag belongs to
* @list_evictor: list of queues to forcefully evict (e.g. due to low memory) * @rcu: rcu head for freeing deferall
*/ */
struct inet_frag_queue { struct inet_frag_queue {
spinlock_t lock; struct rhash_head node;
union {
struct frag_v4_compare_key v4;
struct frag_v6_compare_key v6;
} key;
struct timer_list timer; struct timer_list timer;
struct hlist_node list; spinlock_t lock;
atomic_t refcnt; atomic_t refcnt;
struct sk_buff *fragments; struct sk_buff *fragments; /* Used in IPv6. */
struct rb_root rb_fragments; /* Used in IPv4. */
struct sk_buff *fragments_tail; struct sk_buff *fragments_tail;
struct sk_buff *last_run_head;
ktime_t stamp; ktime_t stamp;
int len; int len;
int meat; int meat;
__u8 flags; __u8 flags;
u16 max_size; u16 max_size;
struct netns_frags *net; struct netns_frags *net;
struct hlist_node list_evictor; struct rcu_head rcu;
};
#define INETFRAGS_HASHSZ 1024
/* averaged:
* max_depth = default ipfrag_high_thresh / INETFRAGS_HASHSZ /
* rounded up (SKB_TRUELEN(0) + sizeof(struct ipq or
* struct frag_queue))
*/
#define INETFRAGS_MAXDEPTH 128
struct inet_frag_bucket {
struct hlist_head chain;
spinlock_t chain_lock;
}; };
struct inet_frags { struct inet_frags {
struct inet_frag_bucket hash[INETFRAGS_HASHSZ];
struct work_struct frags_work;
unsigned int next_bucket;
unsigned long last_rebuild_jiffies;
bool rebuild;
/* The first call to hashfn is responsible to initialize
* rnd. This is best done with net_get_random_once.
*
* rnd_seqlock is used to let hash insertion detect
* when it needs to re-lookup the hash chain to use.
*/
u32 rnd;
seqlock_t rnd_seqlock;
int qsize; int qsize;
unsigned int (*hashfn)(const struct inet_frag_queue *);
bool (*match)(const struct inet_frag_queue *q,
const void *arg);
void (*constructor)(struct inet_frag_queue *q, void (*constructor)(struct inet_frag_queue *q,
const void *arg); const void *arg);
void (*destructor)(struct inet_frag_queue *); void (*destructor)(struct inet_frag_queue *);
@ -98,56 +98,47 @@ struct inet_frags {
void (*frag_expire)(unsigned long data); void (*frag_expire)(unsigned long data);
struct kmem_cache *frags_cachep; struct kmem_cache *frags_cachep;
const char *frags_cache_name; const char *frags_cache_name;
struct rhashtable_params rhash_params;
}; };
int inet_frags_init(struct inet_frags *); int inet_frags_init(struct inet_frags *);
void inet_frags_fini(struct inet_frags *); void inet_frags_fini(struct inet_frags *);
static inline void inet_frags_init_net(struct netns_frags *nf) static inline int inet_frags_init_net(struct netns_frags *nf)
{ {
atomic_set(&nf->mem, 0); atomic_long_set(&nf->mem, 0);
return rhashtable_init(&nf->rhashtable, &nf->f->rhash_params);
} }
void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f); void inet_frags_exit_net(struct netns_frags *nf);
void inet_frag_kill(struct inet_frag_queue *q, struct inet_frags *f); void inet_frag_kill(struct inet_frag_queue *q);
void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f); void inet_frag_destroy(struct inet_frag_queue *q);
struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key);
struct inet_frags *f, void *key, unsigned int hash);
void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q, /* Free all skbs in the queue; return the sum of their truesizes. */
const char *prefix); unsigned int inet_frag_rbtree_purge(struct rb_root *root);
static inline void inet_frag_put(struct inet_frag_queue *q, struct inet_frags *f) static inline void inet_frag_put(struct inet_frag_queue *q)
{ {
if (atomic_dec_and_test(&q->refcnt)) if (atomic_dec_and_test(&q->refcnt))
inet_frag_destroy(q, f); inet_frag_destroy(q);
}
static inline bool inet_frag_evicting(struct inet_frag_queue *q)
{
return !hlist_unhashed(&q->list_evictor);
} }
/* Memory Tracking Functions. */ /* Memory Tracking Functions. */
static inline int frag_mem_limit(struct netns_frags *nf) static inline long frag_mem_limit(const struct netns_frags *nf)
{ {
return atomic_read(&nf->mem); return atomic_long_read(&nf->mem);
} }
static inline void sub_frag_mem_limit(struct netns_frags *nf, int i) static inline void sub_frag_mem_limit(struct netns_frags *nf, long val)
{ {
atomic_sub(i, &nf->mem); atomic_long_sub(val, &nf->mem);
} }
static inline void add_frag_mem_limit(struct netns_frags *nf, int i) static inline void add_frag_mem_limit(struct netns_frags *nf, long val)
{ {
atomic_add(i, &nf->mem); atomic_long_add(val, &nf->mem);
}
static inline int sum_frag_mem_limit(struct netns_frags *nf)
{
return atomic_read(&nf->mem);
} }
/* RFC 3168 support : /* RFC 3168 support :
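
As a reading aid, a hedged sketch of the per-netns setup implied by the new header (threshold values and the sysctl helper are placeholders; the real pattern is lowpan_frags_init_net() further down in this diff): thresholds and nf->f must be set before inet_frags_init_net(), which now builds the rhashtable and can fail.

static int __net_init example_frags_init_net(struct net *net)
{
	struct netns_frags *nf = &net->ipv4.frags;
	int res;

	nf->high_thresh = 4 * 1024 * 1024;
	nf->low_thresh  = 3 * 1024 * 1024;
	nf->timeout     = 30 * HZ;
	nf->f           = &ip4_frags;	/* the protocol's struct inet_frags */

	res = inet_frags_init_net(nf);
	if (res < 0)
		return res;

	res = example_sysctl_register(net);	/* placeholder helper */
	if (res < 0)
		inet_frags_exit_net(nf);
	return res;
}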


@ -525,7 +525,6 @@ static inline struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *s
return skb; return skb;
} }
#endif #endif
int ip_frag_mem(struct net *net);
/* /*
* Functions provided by ip_forward.c * Functions provided by ip_forward.c


@ -320,13 +320,6 @@ static inline bool ipv6_accept_ra(struct inet6_dev *idev)
idev->cnf.accept_ra; idev->cnf.accept_ra;
} }
#if IS_ENABLED(CONFIG_IPV6)
static inline int ip6_frag_mem(struct net *net)
{
return sum_frag_mem_limit(&net->ipv6.frags);
}
#endif
#define IPV6_FRAG_HIGH_THRESH (4 * 1024*1024) /* 4194304 */ #define IPV6_FRAG_HIGH_THRESH (4 * 1024*1024) /* 4194304 */
#define IPV6_FRAG_LOW_THRESH (3 * 1024*1024) /* 3145728 */ #define IPV6_FRAG_LOW_THRESH (3 * 1024*1024) /* 3145728 */
#define IPV6_FRAG_TIMEOUT (60 * HZ) /* 60 seconds */ #define IPV6_FRAG_TIMEOUT (60 * HZ) /* 60 seconds */
@ -505,17 +498,8 @@ enum ip6_defrag_users {
__IP6_DEFRAG_CONNTRACK_BRIDGE_IN = IP6_DEFRAG_CONNTRACK_BRIDGE_IN + USHRT_MAX, __IP6_DEFRAG_CONNTRACK_BRIDGE_IN = IP6_DEFRAG_CONNTRACK_BRIDGE_IN + USHRT_MAX,
}; };
struct ip6_create_arg {
__be32 id;
u32 user;
const struct in6_addr *src;
const struct in6_addr *dst;
int iif;
u8 ecn;
};
void ip6_frag_init(struct inet_frag_queue *q, const void *a); void ip6_frag_init(struct inet_frag_queue *q, const void *a);
bool ip6_frag_match(const struct inet_frag_queue *q, const void *a); extern const struct rhashtable_params ip6_rhash_params;
/* /*
* Equivalent of ipv4 struct ip * Equivalent of ipv4 struct ip
@ -523,19 +507,13 @@ bool ip6_frag_match(const struct inet_frag_queue *q, const void *a);
struct frag_queue { struct frag_queue {
struct inet_frag_queue q; struct inet_frag_queue q;
__be32 id; /* fragment id */
u32 user;
struct in6_addr saddr;
struct in6_addr daddr;
int iif; int iif;
unsigned int csum; unsigned int csum;
__u16 nhoffset; __u16 nhoffset;
u8 ecn; u8 ecn;
}; };
void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq, void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq);
struct inet_frags *frags);
static inline bool ipv6_addr_any(const struct in6_addr *a) static inline bool ipv6_addr_any(const struct in6_addr *a)
{ {


@ -55,6 +55,7 @@ enum
IPSTATS_MIB_ECT1PKTS, /* InECT1Pkts */ IPSTATS_MIB_ECT1PKTS, /* InECT1Pkts */
IPSTATS_MIB_ECT0PKTS, /* InECT0Pkts */ IPSTATS_MIB_ECT0PKTS, /* InECT0Pkts */
IPSTATS_MIB_CEPKTS, /* InCEPkts */ IPSTATS_MIB_CEPKTS, /* InCEPkts */
IPSTATS_MIB_REASM_OVERLAPS, /* ReasmOverlaps */
__IPSTATS_MIB_MAX __IPSTATS_MIB_MAX
}; };


@ -3817,7 +3817,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
continue; continue;
rdp = per_cpu_ptr(rsp->rda, cpu); rdp = per_cpu_ptr(rsp->rda, cpu);
pr_cont(" %d-%c%c%c", cpu, pr_cont(" %d-%c%c%c", cpu,
"O."[cpu_online(cpu)], "O."[!!cpu_online(cpu)],
"o."[!!(rdp->grpmask & rnp->expmaskinit)], "o."[!!(rdp->grpmask & rnp->expmaskinit)],
"N."[!!(rdp->grpmask & rnp->expmaskinitnext)]); "N."[!!(rdp->grpmask & rnp->expmaskinitnext)]);
} }


@ -250,8 +250,10 @@ static int rhashtable_rehash_table(struct rhashtable *ht)
if (!new_tbl) if (!new_tbl)
return 0; return 0;
for (old_hash = 0; old_hash < old_tbl->size; old_hash++) for (old_hash = 0; old_hash < old_tbl->size; old_hash++) {
rhashtable_rehash_chain(ht, old_hash); rhashtable_rehash_chain(ht, old_hash);
cond_resched();
}
/* Publish the new table pointer. */ /* Publish the new table pointer. */
rcu_assign_pointer(ht->tbl, new_tbl); rcu_assign_pointer(ht->tbl, new_tbl);
@ -441,7 +443,8 @@ EXPORT_SYMBOL_GPL(rhashtable_insert_rehash);
struct bucket_table *rhashtable_insert_slow(struct rhashtable *ht, struct bucket_table *rhashtable_insert_slow(struct rhashtable *ht,
const void *key, const void *key,
struct rhash_head *obj, struct rhash_head *obj,
struct bucket_table *tbl) struct bucket_table *tbl,
void **data)
{ {
struct rhash_head *head; struct rhash_head *head;
unsigned int hash; unsigned int hash;
@ -452,8 +455,11 @@ struct bucket_table *rhashtable_insert_slow(struct rhashtable *ht,
spin_lock_nested(rht_bucket_lock(tbl, hash), SINGLE_DEPTH_NESTING); spin_lock_nested(rht_bucket_lock(tbl, hash), SINGLE_DEPTH_NESTING);
err = -EEXIST; err = -EEXIST;
if (key && rhashtable_lookup_fast(ht, key, ht->p)) if (key) {
goto exit; *data = rhashtable_lookup_fast(ht, key, ht->p);
if (*data)
goto exit;
}
err = -E2BIG; err = -E2BIG;
if (unlikely(rht_grow_above_max(ht, tbl))) if (unlikely(rht_grow_above_max(ht, tbl)))
@ -838,6 +844,7 @@ void rhashtable_free_and_destroy(struct rhashtable *ht,
for (i = 0; i < tbl->size; i++) { for (i = 0; i < tbl->size; i++) {
struct rhash_head *pos, *next; struct rhash_head *pos, *next;
cond_resched();
for (pos = rht_dereference(tbl->buckets[i], ht), for (pos = rht_dereference(tbl->buckets[i], ht),
next = !rht_is_a_nulls(pos) ? next = !rht_is_a_nulls(pos) ?
rht_dereference(pos->next, ht) : NULL; rht_dereference(pos->next, ht) : NULL;


@ -1502,6 +1502,21 @@ done:
} }
EXPORT_SYMBOL(___pskb_trim); EXPORT_SYMBOL(___pskb_trim);
/* Note : use pskb_trim_rcsum() instead of calling this directly
*/
int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len)
{
if (skb->ip_summed == CHECKSUM_COMPLETE) {
int delta = skb->len - len;
skb->csum = csum_block_sub(skb->csum,
skb_checksum(skb, len, delta, 0),
len);
}
return __pskb_trim(skb, len);
}
EXPORT_SYMBOL(pskb_trim_rcsum_slow);
/** /**
* __pskb_pull_tail - advance tail of skb header * __pskb_pull_tail - advance tail of skb header
* @skb: buffer to reallocate * @skb: buffer to reallocate
@ -2380,23 +2395,27 @@ EXPORT_SYMBOL(skb_queue_purge);
/** /**
* skb_rbtree_purge - empty a skb rbtree * skb_rbtree_purge - empty a skb rbtree
* @root: root of the rbtree to empty * @root: root of the rbtree to empty
* Return value: the sum of truesizes of all purged skbs.
* *
* Delete all buffers on an &sk_buff rbtree. Each buffer is removed from * Delete all buffers on an &sk_buff rbtree. Each buffer is removed from
* the list and one reference dropped. This function does not take * the list and one reference dropped. This function does not take
* any lock. Synchronization should be handled by the caller (e.g., TCP * any lock. Synchronization should be handled by the caller (e.g., TCP
* out-of-order queue is protected by the socket lock). * out-of-order queue is protected by the socket lock).
*/ */
void skb_rbtree_purge(struct rb_root *root) unsigned int skb_rbtree_purge(struct rb_root *root)
{ {
struct rb_node *p = rb_first(root); struct rb_node *p = rb_first(root);
unsigned int sum = 0;
while (p) { while (p) {
struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode); struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode);
p = rb_next(p); p = rb_next(p);
rb_erase(&skb->rbnode, root); rb_erase(&skb->rbnode, root);
sum += skb->truesize;
kfree_skb(skb); kfree_skb(skb);
} }
return sum;
} }
/** /**
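
Callers stay unchanged; a minimal sketch (variable names illustrative) of the usual call site, where only CHECKSUM_COMPLETE skbs take the new slow path that subtracts the checksum of the trimmed tail instead of invalidating the checksum:

	/* Trim the fragment to its useful payload; skb->csum stays valid
	 * for CHECKSUM_COMPLETE skbs thanks to pskb_trim_rcsum_slow().
	 */
	if (pskb_trim_rcsum(skb, end - offset))
		goto discard;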


@ -16,37 +16,19 @@ typedef unsigned __bitwise__ lowpan_rx_result;
#define LOWPAN_DISPATCH_FRAG1 0xc0 #define LOWPAN_DISPATCH_FRAG1 0xc0
#define LOWPAN_DISPATCH_FRAGN 0xe0 #define LOWPAN_DISPATCH_FRAGN 0xe0
struct lowpan_create_arg { struct frag_lowpan_compare_key {
u16 tag; u16 tag;
u16 d_size; u16 d_size;
const struct ieee802154_addr *src; struct ieee802154_addr src;
const struct ieee802154_addr *dst; struct ieee802154_addr dst;
}; };
/* Equivalent of ipv4 struct ip /* Equivalent of ipv4 struct ipq
*/ */
struct lowpan_frag_queue { struct lowpan_frag_queue {
struct inet_frag_queue q; struct inet_frag_queue q;
u16 tag;
u16 d_size;
struct ieee802154_addr saddr;
struct ieee802154_addr daddr;
}; };
static inline u32 ieee802154_addr_hash(const struct ieee802154_addr *a)
{
switch (a->mode) {
case IEEE802154_ADDR_LONG:
return (((__force u64)a->extended_addr) >> 32) ^
(((__force u64)a->extended_addr) & 0xffffffff);
case IEEE802154_ADDR_SHORT:
return (__force u32)(a->short_addr);
default:
return 0;
}
}
/* private device info */ /* private device info */
struct lowpan_dev_info { struct lowpan_dev_info {
struct net_device *wdev; /* wpan device ptr */ struct net_device *wdev; /* wpan device ptr */


@ -37,47 +37,15 @@ static struct inet_frags lowpan_frags;
static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, static int lowpan_frag_reasm(struct lowpan_frag_queue *fq,
struct sk_buff *prev, struct net_device *ldev); struct sk_buff *prev, struct net_device *ldev);
static unsigned int lowpan_hash_frag(u16 tag, u16 d_size,
const struct ieee802154_addr *saddr,
const struct ieee802154_addr *daddr)
{
net_get_random_once(&lowpan_frags.rnd, sizeof(lowpan_frags.rnd));
return jhash_3words(ieee802154_addr_hash(saddr),
ieee802154_addr_hash(daddr),
(__force u32)(tag + (d_size << 16)),
lowpan_frags.rnd);
}
static unsigned int lowpan_hashfn(const struct inet_frag_queue *q)
{
const struct lowpan_frag_queue *fq;
fq = container_of(q, struct lowpan_frag_queue, q);
return lowpan_hash_frag(fq->tag, fq->d_size, &fq->saddr, &fq->daddr);
}
static bool lowpan_frag_match(const struct inet_frag_queue *q, const void *a)
{
const struct lowpan_frag_queue *fq;
const struct lowpan_create_arg *arg = a;
fq = container_of(q, struct lowpan_frag_queue, q);
return fq->tag == arg->tag && fq->d_size == arg->d_size &&
ieee802154_addr_equal(&fq->saddr, arg->src) &&
ieee802154_addr_equal(&fq->daddr, arg->dst);
}
static void lowpan_frag_init(struct inet_frag_queue *q, const void *a) static void lowpan_frag_init(struct inet_frag_queue *q, const void *a)
{ {
const struct lowpan_create_arg *arg = a; const struct frag_lowpan_compare_key *key = a;
struct lowpan_frag_queue *fq; struct lowpan_frag_queue *fq;
fq = container_of(q, struct lowpan_frag_queue, q); fq = container_of(q, struct lowpan_frag_queue, q);
fq->tag = arg->tag; BUILD_BUG_ON(sizeof(*key) > sizeof(q->key));
fq->d_size = arg->d_size; memcpy(&q->key, key, sizeof(*key));
fq->saddr = *arg->src;
fq->daddr = *arg->dst;
} }
static void lowpan_frag_expire(unsigned long data) static void lowpan_frag_expire(unsigned long data)
@ -93,10 +61,10 @@ static void lowpan_frag_expire(unsigned long data)
if (fq->q.flags & INET_FRAG_COMPLETE) if (fq->q.flags & INET_FRAG_COMPLETE)
goto out; goto out;
inet_frag_kill(&fq->q, &lowpan_frags); inet_frag_kill(&fq->q);
out: out:
spin_unlock(&fq->q.lock); spin_unlock(&fq->q.lock);
inet_frag_put(&fq->q, &lowpan_frags); inet_frag_put(&fq->q);
} }
static inline struct lowpan_frag_queue * static inline struct lowpan_frag_queue *
@ -104,25 +72,20 @@ fq_find(struct net *net, const struct lowpan_802154_cb *cb,
const struct ieee802154_addr *src, const struct ieee802154_addr *src,
const struct ieee802154_addr *dst) const struct ieee802154_addr *dst)
{ {
struct inet_frag_queue *q;
struct lowpan_create_arg arg;
unsigned int hash;
struct netns_ieee802154_lowpan *ieee802154_lowpan = struct netns_ieee802154_lowpan *ieee802154_lowpan =
net_ieee802154_lowpan(net); net_ieee802154_lowpan(net);
struct frag_lowpan_compare_key key = {};
struct inet_frag_queue *q;
arg.tag = cb->d_tag; key.tag = cb->d_tag;
arg.d_size = cb->d_size; key.d_size = cb->d_size;
arg.src = src; key.src = *src;
arg.dst = dst; key.dst = *dst;
hash = lowpan_hash_frag(cb->d_tag, cb->d_size, src, dst); q = inet_frag_find(&ieee802154_lowpan->frags, &key);
if (!q)
q = inet_frag_find(&ieee802154_lowpan->frags,
&lowpan_frags, &arg, hash);
if (IS_ERR_OR_NULL(q)) {
inet_frag_maybe_warn_overflow(q, pr_fmt());
return NULL; return NULL;
}
return container_of(q, struct lowpan_frag_queue, q); return container_of(q, struct lowpan_frag_queue, q);
} }
@ -229,7 +192,7 @@ static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, struct sk_buff *prev,
struct sk_buff *fp, *head = fq->q.fragments; struct sk_buff *fp, *head = fq->q.fragments;
int sum_truesize; int sum_truesize;
inet_frag_kill(&fq->q, &lowpan_frags); inet_frag_kill(&fq->q);
/* Make the one we just received the head. */ /* Make the one we just received the head. */
if (prev) { if (prev) {
@ -408,7 +371,7 @@ int lowpan_frag_rcv(struct sk_buff *skb, u8 frag_type)
struct lowpan_frag_queue *fq; struct lowpan_frag_queue *fq;
struct net *net = dev_net(skb->dev); struct net *net = dev_net(skb->dev);
struct lowpan_802154_cb *cb = lowpan_802154_cb(skb); struct lowpan_802154_cb *cb = lowpan_802154_cb(skb);
struct ieee802154_hdr hdr; struct ieee802154_hdr hdr = {};
int err; int err;
if (ieee802154_hdr_peek_addrs(skb, &hdr) < 0) if (ieee802154_hdr_peek_addrs(skb, &hdr) < 0)
@ -437,7 +400,7 @@ int lowpan_frag_rcv(struct sk_buff *skb, u8 frag_type)
ret = lowpan_frag_queue(fq, skb, frag_type); ret = lowpan_frag_queue(fq, skb, frag_type);
spin_unlock(&fq->q.lock); spin_unlock(&fq->q.lock);
inet_frag_put(&fq->q, &lowpan_frags); inet_frag_put(&fq->q);
return ret; return ret;
} }
@ -447,24 +410,22 @@ err:
} }
#ifdef CONFIG_SYSCTL #ifdef CONFIG_SYSCTL
static int zero;
static struct ctl_table lowpan_frags_ns_ctl_table[] = { static struct ctl_table lowpan_frags_ns_ctl_table[] = {
{ {
.procname = "6lowpanfrag_high_thresh", .procname = "6lowpanfrag_high_thresh",
.data = &init_net.ieee802154_lowpan.frags.high_thresh, .data = &init_net.ieee802154_lowpan.frags.high_thresh,
.maxlen = sizeof(int), .maxlen = sizeof(unsigned long),
.mode = 0644, .mode = 0644,
.proc_handler = proc_dointvec_minmax, .proc_handler = proc_doulongvec_minmax,
.extra1 = &init_net.ieee802154_lowpan.frags.low_thresh .extra1 = &init_net.ieee802154_lowpan.frags.low_thresh
}, },
{ {
.procname = "6lowpanfrag_low_thresh", .procname = "6lowpanfrag_low_thresh",
.data = &init_net.ieee802154_lowpan.frags.low_thresh, .data = &init_net.ieee802154_lowpan.frags.low_thresh,
.maxlen = sizeof(int), .maxlen = sizeof(unsigned long),
.mode = 0644, .mode = 0644,
.proc_handler = proc_dointvec_minmax, .proc_handler = proc_doulongvec_minmax,
.extra1 = &zero,
.extra2 = &init_net.ieee802154_lowpan.frags.high_thresh .extra2 = &init_net.ieee802154_lowpan.frags.high_thresh
}, },
{ {
@ -580,14 +541,20 @@ static int __net_init lowpan_frags_init_net(struct net *net)
{ {
struct netns_ieee802154_lowpan *ieee802154_lowpan = struct netns_ieee802154_lowpan *ieee802154_lowpan =
net_ieee802154_lowpan(net); net_ieee802154_lowpan(net);
int res;
ieee802154_lowpan->frags.high_thresh = IPV6_FRAG_HIGH_THRESH; ieee802154_lowpan->frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
ieee802154_lowpan->frags.low_thresh = IPV6_FRAG_LOW_THRESH; ieee802154_lowpan->frags.low_thresh = IPV6_FRAG_LOW_THRESH;
ieee802154_lowpan->frags.timeout = IPV6_FRAG_TIMEOUT; ieee802154_lowpan->frags.timeout = IPV6_FRAG_TIMEOUT;
ieee802154_lowpan->frags.f = &lowpan_frags;
inet_frags_init_net(&ieee802154_lowpan->frags); res = inet_frags_init_net(&ieee802154_lowpan->frags);
if (res < 0)
return lowpan_frags_ns_sysctl_register(net); return res;
res = lowpan_frags_ns_sysctl_register(net);
if (res < 0)
inet_frags_exit_net(&ieee802154_lowpan->frags);
return res;
} }
static void __net_exit lowpan_frags_exit_net(struct net *net) static void __net_exit lowpan_frags_exit_net(struct net *net)
@ -596,7 +563,7 @@ static void __net_exit lowpan_frags_exit_net(struct net *net)
net_ieee802154_lowpan(net); net_ieee802154_lowpan(net);
lowpan_frags_ns_sysctl_unregister(net); lowpan_frags_ns_sysctl_unregister(net);
inet_frags_exit_net(&ieee802154_lowpan->frags, &lowpan_frags); inet_frags_exit_net(&ieee802154_lowpan->frags);
} }
static struct pernet_operations lowpan_frags_ops = { static struct pernet_operations lowpan_frags_ops = {
@ -604,33 +571,64 @@ static struct pernet_operations lowpan_frags_ops = {
.exit = lowpan_frags_exit_net, .exit = lowpan_frags_exit_net,
}; };
static u32 lowpan_key_hashfn(const void *data, u32 len, u32 seed)
{
return jhash2(data,
sizeof(struct frag_lowpan_compare_key) / sizeof(u32), seed);
}
static u32 lowpan_obj_hashfn(const void *data, u32 len, u32 seed)
{
const struct inet_frag_queue *fq = data;
return jhash2((const u32 *)&fq->key,
sizeof(struct frag_lowpan_compare_key) / sizeof(u32), seed);
}
static int lowpan_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr)
{
const struct frag_lowpan_compare_key *key = arg->key;
const struct inet_frag_queue *fq = ptr;
return !!memcmp(&fq->key, key, sizeof(*key));
}
static const struct rhashtable_params lowpan_rhash_params = {
.head_offset = offsetof(struct inet_frag_queue, node),
.hashfn = lowpan_key_hashfn,
.obj_hashfn = lowpan_obj_hashfn,
.obj_cmpfn = lowpan_obj_cmpfn,
.automatic_shrinking = true,
};
int __init lowpan_net_frag_init(void) int __init lowpan_net_frag_init(void)
{ {
int ret; int ret;
ret = lowpan_frags_sysctl_register();
if (ret)
return ret;
ret = register_pernet_subsys(&lowpan_frags_ops);
if (ret)
goto err_pernet;
lowpan_frags.hashfn = lowpan_hashfn;
lowpan_frags.constructor = lowpan_frag_init; lowpan_frags.constructor = lowpan_frag_init;
lowpan_frags.destructor = NULL; lowpan_frags.destructor = NULL;
lowpan_frags.skb_free = NULL; lowpan_frags.skb_free = NULL;
lowpan_frags.qsize = sizeof(struct frag_queue); lowpan_frags.qsize = sizeof(struct frag_queue);
lowpan_frags.match = lowpan_frag_match;
lowpan_frags.frag_expire = lowpan_frag_expire; lowpan_frags.frag_expire = lowpan_frag_expire;
lowpan_frags.frags_cache_name = lowpan_frags_cache_name; lowpan_frags.frags_cache_name = lowpan_frags_cache_name;
lowpan_frags.rhash_params = lowpan_rhash_params;
ret = inet_frags_init(&lowpan_frags); ret = inet_frags_init(&lowpan_frags);
if (ret) if (ret)
goto err_pernet; goto out;
ret = lowpan_frags_sysctl_register();
if (ret)
goto err_sysctl;
ret = register_pernet_subsys(&lowpan_frags_ops);
if (ret)
goto err_pernet;
out:
return ret; return ret;
err_pernet: err_pernet:
lowpan_frags_sysctl_unregister(); lowpan_frags_sysctl_unregister();
err_sysctl:
inet_frags_fini(&lowpan_frags);
return ret; return ret;
} }


@ -25,12 +25,6 @@
#include <net/inet_frag.h> #include <net/inet_frag.h>
#include <net/inet_ecn.h> #include <net/inet_ecn.h>
#define INETFRAGS_EVICT_BUCKETS 128
#define INETFRAGS_EVICT_MAX 512
/* don't rebuild inetfrag table with new secret more often than this */
#define INETFRAGS_MIN_REBUILD_INTERVAL (5 * HZ)
/* Given the OR values of all fragments, apply RFC 3168 5.3 requirements /* Given the OR values of all fragments, apply RFC 3168 5.3 requirements
* Value : 0xff if frame should be dropped. * Value : 0xff if frame should be dropped.
* 0 or INET_ECN_CE value, to be ORed in to final iph->tos field * 0 or INET_ECN_CE value, to be ORed in to final iph->tos field
@ -52,157 +46,8 @@ const u8 ip_frag_ecn_table[16] = {
}; };
EXPORT_SYMBOL(ip_frag_ecn_table); EXPORT_SYMBOL(ip_frag_ecn_table);
static unsigned int
inet_frag_hashfn(const struct inet_frags *f, const struct inet_frag_queue *q)
{
return f->hashfn(q) & (INETFRAGS_HASHSZ - 1);
}
static bool inet_frag_may_rebuild(struct inet_frags *f)
{
return time_after(jiffies,
f->last_rebuild_jiffies + INETFRAGS_MIN_REBUILD_INTERVAL);
}
static void inet_frag_secret_rebuild(struct inet_frags *f)
{
int i;
write_seqlock_bh(&f->rnd_seqlock);
if (!inet_frag_may_rebuild(f))
goto out;
get_random_bytes(&f->rnd, sizeof(u32));
for (i = 0; i < INETFRAGS_HASHSZ; i++) {
struct inet_frag_bucket *hb;
struct inet_frag_queue *q;
struct hlist_node *n;
hb = &f->hash[i];
spin_lock(&hb->chain_lock);
hlist_for_each_entry_safe(q, n, &hb->chain, list) {
unsigned int hval = inet_frag_hashfn(f, q);
if (hval != i) {
struct inet_frag_bucket *hb_dest;
hlist_del(&q->list);
/* Relink to new hash chain. */
hb_dest = &f->hash[hval];
/* This is the only place where we take
* another chain_lock while already holding
* one. As this will not run concurrently,
* we cannot deadlock on hb_dest lock below, if its
* already locked it will be released soon since
* other caller cannot be waiting for hb lock
* that we've taken above.
*/
spin_lock_nested(&hb_dest->chain_lock,
SINGLE_DEPTH_NESTING);
hlist_add_head(&q->list, &hb_dest->chain);
spin_unlock(&hb_dest->chain_lock);
}
}
spin_unlock(&hb->chain_lock);
}
f->rebuild = false;
f->last_rebuild_jiffies = jiffies;
out:
write_sequnlock_bh(&f->rnd_seqlock);
}
static bool inet_fragq_should_evict(const struct inet_frag_queue *q)
{
if (!hlist_unhashed(&q->list_evictor))
return false;
return q->net->low_thresh == 0 ||
frag_mem_limit(q->net) >= q->net->low_thresh;
}
static unsigned int
inet_evict_bucket(struct inet_frags *f, struct inet_frag_bucket *hb)
{
struct inet_frag_queue *fq;
struct hlist_node *n;
unsigned int evicted = 0;
HLIST_HEAD(expired);
spin_lock(&hb->chain_lock);
hlist_for_each_entry_safe(fq, n, &hb->chain, list) {
if (!inet_fragq_should_evict(fq))
continue;
if (!del_timer(&fq->timer))
continue;
hlist_add_head(&fq->list_evictor, &expired);
++evicted;
}
spin_unlock(&hb->chain_lock);
hlist_for_each_entry_safe(fq, n, &expired, list_evictor)
f->frag_expire((unsigned long) fq);
return evicted;
}
static void inet_frag_worker(struct work_struct *work)
{
unsigned int budget = INETFRAGS_EVICT_BUCKETS;
unsigned int i, evicted = 0;
struct inet_frags *f;
f = container_of(work, struct inet_frags, frags_work);
BUILD_BUG_ON(INETFRAGS_EVICT_BUCKETS >= INETFRAGS_HASHSZ);
local_bh_disable();
for (i = ACCESS_ONCE(f->next_bucket); budget; --budget) {
evicted += inet_evict_bucket(f, &f->hash[i]);
i = (i + 1) & (INETFRAGS_HASHSZ - 1);
if (evicted > INETFRAGS_EVICT_MAX)
break;
}
f->next_bucket = i;
local_bh_enable();
if (f->rebuild && inet_frag_may_rebuild(f))
inet_frag_secret_rebuild(f);
}
static void inet_frag_schedule_worker(struct inet_frags *f)
{
if (unlikely(!work_pending(&f->frags_work)))
schedule_work(&f->frags_work);
}
int inet_frags_init(struct inet_frags *f) int inet_frags_init(struct inet_frags *f)
{ {
int i;
INIT_WORK(&f->frags_work, inet_frag_worker);
for (i = 0; i < INETFRAGS_HASHSZ; i++) {
struct inet_frag_bucket *hb = &f->hash[i];
spin_lock_init(&hb->chain_lock);
INIT_HLIST_HEAD(&hb->chain);
}
seqlock_init(&f->rnd_seqlock);
f->last_rebuild_jiffies = 0;
f->frags_cachep = kmem_cache_create(f->frags_cache_name, f->qsize, 0, 0, f->frags_cachep = kmem_cache_create(f->frags_cache_name, f->qsize, 0, 0,
NULL); NULL);
if (!f->frags_cachep) if (!f->frags_cachep)
@ -214,73 +59,53 @@ EXPORT_SYMBOL(inet_frags_init);
void inet_frags_fini(struct inet_frags *f) void inet_frags_fini(struct inet_frags *f)
{ {
cancel_work_sync(&f->frags_work); /* We must wait that all inet_frag_destroy_rcu() have completed. */
rcu_barrier();
kmem_cache_destroy(f->frags_cachep); kmem_cache_destroy(f->frags_cachep);
f->frags_cachep = NULL;
} }
EXPORT_SYMBOL(inet_frags_fini); EXPORT_SYMBOL(inet_frags_fini);
void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f) static void inet_frags_free_cb(void *ptr, void *arg)
{ {
unsigned int seq; struct inet_frag_queue *fq = ptr;
int i;
nf->low_thresh = 0; /* If we can not cancel the timer, it means this frag_queue
* is already disappearing, we have nothing to do.
* Otherwise, we own a refcount until the end of this function.
*/
if (!del_timer(&fq->timer))
return;
evict_again: spin_lock_bh(&fq->lock);
local_bh_disable(); if (!(fq->flags & INET_FRAG_COMPLETE)) {
seq = read_seqbegin(&f->rnd_seqlock); fq->flags |= INET_FRAG_COMPLETE;
atomic_dec(&fq->refcnt);
}
spin_unlock_bh(&fq->lock);
for (i = 0; i < INETFRAGS_HASHSZ ; i++) inet_frag_put(fq);
inet_evict_bucket(f, &f->hash[i]); }
local_bh_enable(); void inet_frags_exit_net(struct netns_frags *nf)
cond_resched(); {
nf->high_thresh = 0; /* prevent creation of new frags */
if (read_seqretry(&f->rnd_seqlock, seq) || rhashtable_free_and_destroy(&nf->rhashtable, inet_frags_free_cb, NULL);
sum_frag_mem_limit(nf))
goto evict_again;
} }
EXPORT_SYMBOL(inet_frags_exit_net); EXPORT_SYMBOL(inet_frags_exit_net);
static struct inet_frag_bucket * void inet_frag_kill(struct inet_frag_queue *fq)
get_frag_bucket_locked(struct inet_frag_queue *fq, struct inet_frags *f)
__acquires(hb->chain_lock)
{
struct inet_frag_bucket *hb;
unsigned int seq, hash;
restart:
seq = read_seqbegin(&f->rnd_seqlock);
hash = inet_frag_hashfn(f, fq);
hb = &f->hash[hash];
spin_lock(&hb->chain_lock);
if (read_seqretry(&f->rnd_seqlock, seq)) {
spin_unlock(&hb->chain_lock);
goto restart;
}
return hb;
}
static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f)
{
struct inet_frag_bucket *hb;
hb = get_frag_bucket_locked(fq, f);
hlist_del(&fq->list);
fq->flags |= INET_FRAG_COMPLETE;
spin_unlock(&hb->chain_lock);
}
void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f)
{ {
if (del_timer(&fq->timer)) if (del_timer(&fq->timer))
atomic_dec(&fq->refcnt); atomic_dec(&fq->refcnt);
if (!(fq->flags & INET_FRAG_COMPLETE)) { if (!(fq->flags & INET_FRAG_COMPLETE)) {
fq_unlink(fq, f); struct netns_frags *nf = fq->net;
fq->flags |= INET_FRAG_COMPLETE;
rhashtable_remove_fast(&nf->rhashtable, &fq->node, nf->f->rhash_params);
atomic_dec(&fq->refcnt); atomic_dec(&fq->refcnt);
} }
} }
@ -294,11 +119,23 @@ static inline void frag_kfree_skb(struct netns_frags *nf, struct inet_frags *f,
kfree_skb(skb); kfree_skb(skb);
} }
void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f) static void inet_frag_destroy_rcu(struct rcu_head *head)
{
struct inet_frag_queue *q = container_of(head, struct inet_frag_queue,
rcu);
struct inet_frags *f = q->net->f;
if (f->destructor)
f->destructor(q);
kmem_cache_free(f->frags_cachep, q);
}
void inet_frag_destroy(struct inet_frag_queue *q)
{ {
struct sk_buff *fp; struct sk_buff *fp;
struct netns_frags *nf; struct netns_frags *nf;
unsigned int sum, sum_truesize = 0; unsigned int sum, sum_truesize = 0;
struct inet_frags *f;
WARN_ON(!(q->flags & INET_FRAG_COMPLETE)); WARN_ON(!(q->flags & INET_FRAG_COMPLETE));
WARN_ON(del_timer(&q->timer) != 0); WARN_ON(del_timer(&q->timer) != 0);
@ -306,64 +143,35 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f)
/* Release all fragment data. */ /* Release all fragment data. */
fp = q->fragments; fp = q->fragments;
nf = q->net; nf = q->net;
while (fp) { f = nf->f;
struct sk_buff *xp = fp->next; if (fp) {
do {
struct sk_buff *xp = fp->next;
sum_truesize += fp->truesize; sum_truesize += fp->truesize;
frag_kfree_skb(nf, f, fp); frag_kfree_skb(nf, f, fp);
fp = xp; fp = xp;
} while (fp);
} else {
sum_truesize = inet_frag_rbtree_purge(&q->rb_fragments);
} }
sum = sum_truesize + f->qsize; sum = sum_truesize + f->qsize;
if (f->destructor) call_rcu(&q->rcu, inet_frag_destroy_rcu);
f->destructor(q);
kmem_cache_free(f->frags_cachep, q);
sub_frag_mem_limit(nf, sum); sub_frag_mem_limit(nf, sum);
} }
EXPORT_SYMBOL(inet_frag_destroy); EXPORT_SYMBOL(inet_frag_destroy);
static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
struct inet_frag_queue *qp_in,
struct inet_frags *f,
void *arg)
{
struct inet_frag_bucket *hb = get_frag_bucket_locked(qp_in, f);
struct inet_frag_queue *qp;
#ifdef CONFIG_SMP
/* With SMP race we have to recheck hash table, because
* such entry could have been created on other cpu before
* we acquired hash bucket lock.
*/
hlist_for_each_entry(qp, &hb->chain, list) {
if (qp->net == nf && f->match(qp, arg)) {
atomic_inc(&qp->refcnt);
spin_unlock(&hb->chain_lock);
qp_in->flags |= INET_FRAG_COMPLETE;
inet_frag_put(qp_in, f);
return qp;
}
}
#endif
qp = qp_in;
if (!mod_timer(&qp->timer, jiffies + nf->timeout))
atomic_inc(&qp->refcnt);
atomic_inc(&qp->refcnt);
hlist_add_head(&qp->list, &hb->chain);
spin_unlock(&hb->chain_lock);
return qp;
}
static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
struct inet_frags *f, struct inet_frags *f,
void *arg) void *arg)
{ {
struct inet_frag_queue *q; struct inet_frag_queue *q;
if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh)
return NULL;
q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC); q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC);
if (!q) if (!q)
return NULL; return NULL;
@ -374,75 +182,52 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
setup_timer(&q->timer, f->frag_expire, (unsigned long)q); setup_timer(&q->timer, f->frag_expire, (unsigned long)q);
spin_lock_init(&q->lock); spin_lock_init(&q->lock);
atomic_set(&q->refcnt, 1); atomic_set(&q->refcnt, 3);
return q; return q;
} }
static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf, static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf,
struct inet_frags *f, void *arg,
void *arg) struct inet_frag_queue **prev)
{ {
struct inet_frags *f = nf->f;
struct inet_frag_queue *q; struct inet_frag_queue *q;
q = inet_frag_alloc(nf, f, arg); q = inet_frag_alloc(nf, f, arg);
if (!q) if (!q) {
*prev = ERR_PTR(-ENOMEM);
return NULL; return NULL;
}
mod_timer(&q->timer, jiffies + nf->timeout);
return inet_frag_intern(nf, q, f, arg); *prev = rhashtable_lookup_get_insert_key(&nf->rhashtable, &q->key,
&q->node, f->rhash_params);
if (*prev) {
q->flags |= INET_FRAG_COMPLETE;
inet_frag_kill(q);
inet_frag_destroy(q);
return NULL;
}
return q;
} }
EXPORT_SYMBOL(inet_frag_create);
struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, /* TODO : call from rcu_read_lock() and no longer use refcount_inc_not_zero() */
struct inet_frags *f, void *key, struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key)
unsigned int hash)
{ {
struct inet_frag_bucket *hb; struct inet_frag_queue *fq = NULL, *prev;
struct inet_frag_queue *q;
int depth = 0;
if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh) { rcu_read_lock();
inet_frag_schedule_worker(f); prev = rhashtable_lookup(&nf->rhashtable, key, nf->f->rhash_params);
return NULL; if (!prev)
fq = inet_frag_create(nf, key, &prev);
if (prev && !IS_ERR(prev)) {
fq = prev;
if (!atomic_inc_not_zero(&fq->refcnt))
fq = NULL;
} }
rcu_read_unlock();
if (frag_mem_limit(nf) > nf->low_thresh) return fq;
inet_frag_schedule_worker(f);
hash &= (INETFRAGS_HASHSZ - 1);
hb = &f->hash[hash];
spin_lock(&hb->chain_lock);
hlist_for_each_entry(q, &hb->chain, list) {
if (q->net == nf && f->match(q, key)) {
atomic_inc(&q->refcnt);
spin_unlock(&hb->chain_lock);
return q;
}
depth++;
}
spin_unlock(&hb->chain_lock);
if (depth <= INETFRAGS_MAXDEPTH)
return inet_frag_create(nf, f, key);
if (inet_frag_may_rebuild(f)) {
if (!f->rebuild)
f->rebuild = true;
inet_frag_schedule_worker(f);
}
return ERR_PTR(-ENOBUFS);
} }
EXPORT_SYMBOL(inet_frag_find); EXPORT_SYMBOL(inet_frag_find);
void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q,
const char *prefix)
{
static const char msg[] = "inet_frag_find: Fragment hash bucket"
" list length grew over limit " __stringify(INETFRAGS_MAXDEPTH)
". Dropping fragment.\n";
if (PTR_ERR(q) == -ENOBUFS)
net_dbg_ratelimited("%s%s", prefix, msg);
}
EXPORT_SYMBOL(inet_frag_maybe_warn_overflow);
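
A hedged sketch of the caller pattern around the new lookup path (modeled on the 6lowpan fq_find() shown earlier in this diff; names illustrative): inet_frag_find() hands back a queue with its refcount raised, so every successful lookup is eventually balanced by inet_frag_put().

static struct ipq *example_ip_find(struct net *net,
				   struct frag_v4_compare_key *key)
{
	struct inet_frag_queue *q;

	q = inet_frag_find(&net->ipv4.frags, key);
	if (!q)
		return NULL;	/* allocation failed or over high_thresh */

	/* Caller drops the reference with inet_frag_put(&ipq->q) when done. */
	return container_of(q, struct ipq, q);
}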


@ -58,27 +58,64 @@
static int sysctl_ipfrag_max_dist __read_mostly = 64; static int sysctl_ipfrag_max_dist __read_mostly = 64;
static const char ip_frag_cache_name[] = "ip4-frags"; static const char ip_frag_cache_name[] = "ip4-frags";
struct ipfrag_skb_cb /* Use skb->cb to track consecutive/adjacent fragments coming at
{ * the end of the queue. Nodes in the rb-tree queue will
* contain "runs" of one or more adjacent fragments.
*
* Invariants:
* - next_frag is NULL at the tail of a "run";
* - the head of a "run" has the sum of all fragment lengths in frag_run_len.
*/
struct ipfrag_skb_cb {
struct inet_skb_parm h; struct inet_skb_parm h;
int offset; struct sk_buff *next_frag;
int frag_run_len;
}; };
#define FRAG_CB(skb) ((struct ipfrag_skb_cb *)((skb)->cb)) #define FRAG_CB(skb) ((struct ipfrag_skb_cb *)((skb)->cb))
static void ip4_frag_init_run(struct sk_buff *skb)
{
BUILD_BUG_ON(sizeof(struct ipfrag_skb_cb) > sizeof(skb->cb));
FRAG_CB(skb)->next_frag = NULL;
FRAG_CB(skb)->frag_run_len = skb->len;
}
/* Append skb to the last "run". */
static void ip4_frag_append_to_last_run(struct inet_frag_queue *q,
struct sk_buff *skb)
{
RB_CLEAR_NODE(&skb->rbnode);
FRAG_CB(skb)->next_frag = NULL;
FRAG_CB(q->last_run_head)->frag_run_len += skb->len;
FRAG_CB(q->fragments_tail)->next_frag = skb;
q->fragments_tail = skb;
}
/* Create a new "run" with the skb. */
static void ip4_frag_create_run(struct inet_frag_queue *q, struct sk_buff *skb)
{
if (q->last_run_head)
rb_link_node(&skb->rbnode, &q->last_run_head->rbnode,
&q->last_run_head->rbnode.rb_right);
else
rb_link_node(&skb->rbnode, NULL, &q->rb_fragments.rb_node);
rb_insert_color(&skb->rbnode, &q->rb_fragments);
ip4_frag_init_run(skb);
q->fragments_tail = skb;
q->last_run_head = skb;
}
/* Describe an entry in the "incomplete datagrams" queue. */ /* Describe an entry in the "incomplete datagrams" queue. */
struct ipq { struct ipq {
struct inet_frag_queue q; struct inet_frag_queue q;
u32 user;
__be32 saddr;
__be32 daddr;
__be16 id;
u8 protocol;
u8 ecn; /* RFC3168 support */ u8 ecn; /* RFC3168 support */
u16 max_df_size; /* largest frag with DF set seen */ u16 max_df_size; /* largest frag with DF set seen */
int iif; int iif;
int vif; /* L3 master device index */
unsigned int rid; unsigned int rid;
struct inet_peer *peer; struct inet_peer *peer;
}; };
@ -90,49 +127,9 @@ static u8 ip4_frag_ecn(u8 tos)
static struct inet_frags ip4_frags; static struct inet_frags ip4_frags;
int ip_frag_mem(struct net *net) static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
{ struct sk_buff *prev_tail, struct net_device *dev);
return sum_frag_mem_limit(&net->ipv4.frags);
}
static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
struct net_device *dev);
struct ip4_create_arg {
struct iphdr *iph;
u32 user;
int vif;
};
static unsigned int ipqhashfn(__be16 id, __be32 saddr, __be32 daddr, u8 prot)
{
net_get_random_once(&ip4_frags.rnd, sizeof(ip4_frags.rnd));
return jhash_3words((__force u32)id << 16 | prot,
(__force u32)saddr, (__force u32)daddr,
ip4_frags.rnd);
}
static unsigned int ip4_hashfn(const struct inet_frag_queue *q)
{
const struct ipq *ipq;
ipq = container_of(q, struct ipq, q);
return ipqhashfn(ipq->id, ipq->saddr, ipq->daddr, ipq->protocol);
}
static bool ip4_frag_match(const struct inet_frag_queue *q, const void *a)
{
const struct ipq *qp;
const struct ip4_create_arg *arg = a;
qp = container_of(q, struct ipq, q);
return qp->id == arg->iph->id &&
qp->saddr == arg->iph->saddr &&
qp->daddr == arg->iph->daddr &&
qp->protocol == arg->iph->protocol &&
qp->user == arg->user &&
qp->vif == arg->vif;
}
static void ip4_frag_init(struct inet_frag_queue *q, const void *a) static void ip4_frag_init(struct inet_frag_queue *q, const void *a)
{ {
@ -141,17 +138,12 @@ static void ip4_frag_init(struct inet_frag_queue *q, const void *a)
frags); frags);
struct net *net = container_of(ipv4, struct net, ipv4); struct net *net = container_of(ipv4, struct net, ipv4);
const struct ip4_create_arg *arg = a; const struct frag_v4_compare_key *key = a;
qp->protocol = arg->iph->protocol; q->key.v4 = *key;
qp->id = arg->iph->id; qp->ecn = 0;
qp->ecn = ip4_frag_ecn(arg->iph->tos);
qp->saddr = arg->iph->saddr;
qp->daddr = arg->iph->daddr;
qp->vif = arg->vif;
qp->user = arg->user;
qp->peer = sysctl_ipfrag_max_dist ? qp->peer = sysctl_ipfrag_max_dist ?
inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, arg->vif, 1) : inet_getpeer_v4(net->ipv4.peers, key->saddr, key->vif, 1) :
NULL; NULL;
} }
@ -169,7 +161,7 @@ static void ip4_frag_free(struct inet_frag_queue *q)
static void ipq_put(struct ipq *ipq) static void ipq_put(struct ipq *ipq)
{ {
inet_frag_put(&ipq->q, &ip4_frags); inet_frag_put(&ipq->q);
} }
/* Kill ipq entry. It is not destroyed immediately, /* Kill ipq entry. It is not destroyed immediately,
@ -177,7 +169,7 @@ static void ipq_put(struct ipq *ipq)
*/ */
static void ipq_kill(struct ipq *ipq) static void ipq_kill(struct ipq *ipq)
{ {
inet_frag_kill(&ipq->q, &ip4_frags); inet_frag_kill(&ipq->q);
} }
static bool frag_expire_skip_icmp(u32 user) static bool frag_expire_skip_icmp(u32 user)
@ -194,8 +186,11 @@ static bool frag_expire_skip_icmp(u32 user)
*/ */
static void ip_expire(unsigned long arg) static void ip_expire(unsigned long arg)
{ {
struct ipq *qp; const struct iphdr *iph;
struct sk_buff *head = NULL;
struct net *net; struct net *net;
struct ipq *qp;
int err;
qp = container_of((struct inet_frag_queue *) arg, struct ipq, q); qp = container_of((struct inet_frag_queue *) arg, struct ipq, q);
net = container_of(qp->q.net, struct net, ipv4.frags); net = container_of(qp->q.net, struct net, ipv4.frags);
@ -208,51 +203,65 @@ static void ip_expire(unsigned long arg)
ipq_kill(qp); ipq_kill(qp);
IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS); IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT);
if (!inet_frag_evicting(&qp->q)) { if (!(qp->q.flags & INET_FRAG_FIRST_IN))
struct sk_buff *clone, *head = qp->q.fragments; goto out;
const struct iphdr *iph;
int err;
IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT); /* sk_buff::dev and sk_buff::rbnode are unionized. So we
* pull the head out of the tree in order to be able to
if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !qp->q.fragments) * deal with head->dev.
*/
if (qp->q.fragments) {
head = qp->q.fragments;
qp->q.fragments = head->next;
} else {
head = skb_rb_first(&qp->q.rb_fragments);
if (!head)
goto out; goto out;
if (FRAG_CB(head)->next_frag)
head->dev = dev_get_by_index_rcu(net, qp->iif); rb_replace_node(&head->rbnode,
if (!head->dev) &FRAG_CB(head)->next_frag->rbnode,
goto out; &qp->q.rb_fragments);
else
rb_erase(&head->rbnode, &qp->q.rb_fragments);
/* skb has no dst, perform route lookup again */ memset(&head->rbnode, 0, sizeof(head->rbnode));
iph = ip_hdr(head); barrier();
err = ip_route_input_noref(head, iph->daddr, iph->saddr,
iph->tos, head->dev);
if (err)
goto out;
/* Only an end host needs to send an ICMP
* "Fragment Reassembly Timeout" message, per RFC792.
*/
if (frag_expire_skip_icmp(qp->user) &&
(skb_rtable(head)->rt_type != RTN_LOCAL))
goto out;
clone = skb_clone(head, GFP_ATOMIC);
/* Send an ICMP "Fragment Reassembly Timeout" message. */
if (clone) {
spin_unlock(&qp->q.lock);
icmp_send(clone, ICMP_TIME_EXCEEDED,
ICMP_EXC_FRAGTIME, 0);
consume_skb(clone);
goto out_rcu_unlock;
}
} }
if (head == qp->q.fragments_tail)
qp->q.fragments_tail = NULL;
sub_frag_mem_limit(qp->q.net, head->truesize);
head->dev = dev_get_by_index_rcu(net, qp->iif);
if (!head->dev)
goto out;
/* skb has no dst, perform route lookup again */
iph = ip_hdr(head);
err = ip_route_input_noref(head, iph->daddr, iph->saddr,
iph->tos, head->dev);
if (err)
goto out;
/* Only an end host needs to send an ICMP
* "Fragment Reassembly Timeout" message, per RFC792.
*/
if (frag_expire_skip_icmp(qp->q.key.v4.user) &&
(skb_rtable(head)->rt_type != RTN_LOCAL))
goto out;
spin_unlock(&qp->q.lock);
icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
goto out_rcu_unlock;
out: out:
spin_unlock(&qp->q.lock); spin_unlock(&qp->q.lock);
out_rcu_unlock: out_rcu_unlock:
rcu_read_unlock(); rcu_read_unlock();
if (head)
kfree_skb(head);
ipq_put(qp); ipq_put(qp);
} }
@ -262,21 +271,20 @@ out_rcu_unlock:
static struct ipq *ip_find(struct net *net, struct iphdr *iph, static struct ipq *ip_find(struct net *net, struct iphdr *iph,
u32 user, int vif) u32 user, int vif)
{ {
struct frag_v4_compare_key key = {
.saddr = iph->saddr,
.daddr = iph->daddr,
.user = user,
.vif = vif,
.id = iph->id,
.protocol = iph->protocol,
};
struct inet_frag_queue *q; struct inet_frag_queue *q;
struct ip4_create_arg arg;
unsigned int hash;
arg.iph = iph; q = inet_frag_find(&net->ipv4.frags, &key);
arg.user = user; if (!q)
arg.vif = vif;
hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol);
q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash);
if (IS_ERR_OR_NULL(q)) {
inet_frag_maybe_warn_overflow(q, pr_fmt());
return NULL; return NULL;
}
return container_of(q, struct ipq, q); return container_of(q, struct ipq, q);
} }
@ -296,7 +304,7 @@ static int ip_frag_too_far(struct ipq *qp)
end = atomic_inc_return(&peer->rid); end = atomic_inc_return(&peer->rid);
qp->rid = end; qp->rid = end;
rc = qp->q.fragments && (end - start) > max; rc = qp->q.fragments_tail && (end - start) > max;
if (rc) { if (rc) {
struct net *net; struct net *net;
@ -310,7 +318,6 @@ static int ip_frag_too_far(struct ipq *qp)
static int ip_frag_reinit(struct ipq *qp) static int ip_frag_reinit(struct ipq *qp)
{ {
struct sk_buff *fp;
unsigned int sum_truesize = 0; unsigned int sum_truesize = 0;
if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) { if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) {
@ -318,21 +325,16 @@ static int ip_frag_reinit(struct ipq *qp)
return -ETIMEDOUT; return -ETIMEDOUT;
} }
fp = qp->q.fragments; sum_truesize = inet_frag_rbtree_purge(&qp->q.rb_fragments);
do {
struct sk_buff *xp = fp->next;
sum_truesize += fp->truesize;
kfree_skb(fp);
fp = xp;
} while (fp);
sub_frag_mem_limit(qp->q.net, sum_truesize); sub_frag_mem_limit(qp->q.net, sum_truesize);
qp->q.flags = 0; qp->q.flags = 0;
qp->q.len = 0; qp->q.len = 0;
qp->q.meat = 0; qp->q.meat = 0;
qp->q.fragments = NULL; qp->q.fragments = NULL;
qp->q.rb_fragments = RB_ROOT;
qp->q.fragments_tail = NULL; qp->q.fragments_tail = NULL;
qp->q.last_run_head = NULL;
qp->iif = 0; qp->iif = 0;
qp->ecn = 0; qp->ecn = 0;
@ -342,11 +344,13 @@ static int ip_frag_reinit(struct ipq *qp)
/* Add new segment to existing queue. */ /* Add new segment to existing queue. */
static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
{ {
struct sk_buff *prev, *next; struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
struct rb_node **rbn, *parent;
struct sk_buff *skb1, *prev_tail;
int ihl, end, skb1_run_end;
struct net_device *dev; struct net_device *dev;
unsigned int fragsize; unsigned int fragsize;
int flags, offset; int flags, offset;
int ihl, end;
int err = -ENOENT; int err = -ENOENT;
u8 ecn; u8 ecn;
@ -405,94 +409,68 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
if (err) if (err)
goto err; goto err;
/* Find out which fragments are in front and at the back of us /* Note : skb->rbnode and skb->dev share the same location. */
* in the chain of fragments so far. We must know where to put
* this fragment, right?
*/
prev = qp->q.fragments_tail;
if (!prev || FRAG_CB(prev)->offset < offset) {
next = NULL;
goto found;
}
prev = NULL;
for (next = qp->q.fragments; next != NULL; next = next->next) {
if (FRAG_CB(next)->offset >= offset)
break; /* bingo! */
prev = next;
}
found:
/* We found where to put this one. Check for overlap with
* preceding fragment, and, if needed, align things so that
* any overlaps are eliminated.
*/
if (prev) {
int i = (FRAG_CB(prev)->offset + prev->len) - offset;
if (i > 0) {
offset += i;
err = -EINVAL;
if (end <= offset)
goto err;
err = -ENOMEM;
if (!pskb_pull(skb, i))
goto err;
if (skb->ip_summed != CHECKSUM_UNNECESSARY)
skb->ip_summed = CHECKSUM_NONE;
}
}
err = -ENOMEM;
while (next && FRAG_CB(next)->offset < end) {
int i = end - FRAG_CB(next)->offset; /* overlap is 'i' bytes */
if (i < next->len) {
/* Eat head of the next overlapped fragment
* and leave the loop. The next ones cannot overlap.
*/
if (!pskb_pull(next, i))
goto err;
FRAG_CB(next)->offset += i;
qp->q.meat -= i;
if (next->ip_summed != CHECKSUM_UNNECESSARY)
next->ip_summed = CHECKSUM_NONE;
break;
} else {
struct sk_buff *free_it = next;
/* Old fragment is completely overridden with
* new one drop it.
*/
next = next->next;
if (prev)
prev->next = next;
else
qp->q.fragments = next;
qp->q.meat -= free_it->len;
sub_frag_mem_limit(qp->q.net, free_it->truesize);
kfree_skb(free_it);
}
}
FRAG_CB(skb)->offset = offset;
/* Insert this fragment in the chain of fragments. */
skb->next = next;
if (!next)
qp->q.fragments_tail = skb;
if (prev)
prev->next = skb;
else
qp->q.fragments = skb;
dev = skb->dev; dev = skb->dev;
if (dev) { /* Makes sure compiler wont do silly aliasing games */
qp->iif = dev->ifindex; barrier();
skb->dev = NULL;
/* RFC5722, Section 4, amended by Errata ID : 3089
* When reassembling an IPv6 datagram, if
* one or more its constituent fragments is determined to be an
* overlapping fragment, the entire datagram (and any constituent
* fragments) MUST be silently discarded.
*
* We do the same here for IPv4 (and increment an snmp counter) but
* we do not want to drop the whole queue in response to a duplicate
* fragment.
*/
err = -EINVAL;
/* Find out where to put this fragment. */
prev_tail = qp->q.fragments_tail;
if (!prev_tail)
ip4_frag_create_run(&qp->q, skb); /* First fragment. */
else if (prev_tail->ip_defrag_offset + prev_tail->len < end) {
/* This is the common case: skb goes to the end. */
/* Detect and discard overlaps. */
if (offset < prev_tail->ip_defrag_offset + prev_tail->len)
goto discard_qp;
if (offset == prev_tail->ip_defrag_offset + prev_tail->len)
ip4_frag_append_to_last_run(&qp->q, skb);
else
ip4_frag_create_run(&qp->q, skb);
} else {
/* Binary search. Note that skb can become the first fragment,
* but not the last (covered above).
*/
rbn = &qp->q.rb_fragments.rb_node;
do {
parent = *rbn;
skb1 = rb_to_skb(parent);
skb1_run_end = skb1->ip_defrag_offset +
FRAG_CB(skb1)->frag_run_len;
if (end <= skb1->ip_defrag_offset)
rbn = &parent->rb_left;
else if (offset >= skb1_run_end)
rbn = &parent->rb_right;
else if (offset >= skb1->ip_defrag_offset &&
end <= skb1_run_end)
goto err; /* No new data, potential duplicate */
else
goto discard_qp; /* Found an overlap */
} while (*rbn);
/* Here we have parent properly set, and rbn pointing to
* one of its NULL left/right children. Insert skb.
*/
ip4_frag_init_run(skb);
rb_link_node(&skb->rbnode, parent, rbn);
rb_insert_color(&skb->rbnode, &qp->q.rb_fragments);
} }
if (dev)
qp->iif = dev->ifindex;
skb->ip_defrag_offset = offset;
qp->q.stamp = skb->tstamp; qp->q.stamp = skb->tstamp;
qp->q.meat += skb->len; qp->q.meat += skb->len;
qp->ecn |= ecn; qp->ecn |= ecn;
@ -514,7 +492,7 @@ found:
unsigned long orefdst = skb->_skb_refdst; unsigned long orefdst = skb->_skb_refdst;
skb->_skb_refdst = 0UL; skb->_skb_refdst = 0UL;
err = ip_frag_reasm(qp, prev, dev); err = ip_frag_reasm(qp, skb, prev_tail, dev);
skb->_skb_refdst = orefdst; skb->_skb_refdst = orefdst;
return err; return err;
} }
@ -522,20 +500,23 @@ found:
skb_dst_drop(skb); skb_dst_drop(skb);
return -EINPROGRESS; return -EINPROGRESS;
discard_qp:
inet_frag_kill(&qp->q);
IP_INC_STATS_BH(net, IPSTATS_MIB_REASM_OVERLAPS);
err: err:
kfree_skb(skb); kfree_skb(skb);
return err; return err;
} }
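
The rb-tree insertion above classifies the incoming fragment [offset, end) against each run [ip_defrag_offset, run_end): entirely left or entirely right descends the tree, a fully contained fragment is treated as a duplicate, and any partial overlap discards the whole queue via discard_qp. A small self-contained sketch of just that interval logic, assuming half-open byte ranges:

/* Userspace sketch of the classification used by the rb-tree insert above.
 * Interval arithmetic only; the names are illustrative.
 */
#include <assert.h>
#include <stdio.h>

enum frag_pos { GO_LEFT, GO_RIGHT, DUPLICATE, OVERLAP };

/* New fragment covers [off, end); an existing run covers [run_off, run_end). */
static enum frag_pos classify(unsigned int off, unsigned int end,
			      unsigned int run_off, unsigned int run_end)
{
	if (end <= run_off)
		return GO_LEFT;                 /* entirely before the run      */
	if (off >= run_end)
		return GO_RIGHT;                /* entirely after the run       */
	if (off >= run_off && end <= run_end)
		return DUPLICATE;               /* no new data: drop this skb   */
	return OVERLAP;                         /* partial overlap: drop queue  */
}

int main(void)
{
	/* Existing run covers bytes [1480, 2960). */
	assert(classify(0,    1480, 1480, 2960) == GO_LEFT);
	assert(classify(2960, 4440, 1480, 2960) == GO_RIGHT);
	assert(classify(1480, 2960, 1480, 2960) == DUPLICATE);
	assert(classify(1000, 2000, 1480, 2960) == OVERLAP);
	printf("all classifications as expected\n");
	return 0;
}

As the comment in the code above notes, a pure duplicate only drops that one skb; only a genuine partial overlap kills the whole queue and bumps IPSTATS_MIB_REASM_OVERLAPS.
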
/* Build a new IP datagram from all its fragments. */ /* Build a new IP datagram from all its fragments. */
static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, struct sk_buff *prev_tail, struct net_device *dev)
struct net_device *dev)
{ {
struct net *net = container_of(qp->q.net, struct net, ipv4.frags); struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
struct iphdr *iph; struct iphdr *iph;
struct sk_buff *fp, *head = qp->q.fragments; struct sk_buff *fp, *head = skb_rb_first(&qp->q.rb_fragments);
struct sk_buff **nextp; /* To build frag_list. */
struct rb_node *rbn;
int len; int len;
int ihlen; int ihlen;
int err; int err;
@ -549,26 +530,27 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
goto out_fail; goto out_fail;
} }
/* Make the one we just received the head. */ /* Make the one we just received the head. */
if (prev) { if (head != skb) {
head = prev->next; fp = skb_clone(skb, GFP_ATOMIC);
fp = skb_clone(head, GFP_ATOMIC);
if (!fp) if (!fp)
goto out_nomem; goto out_nomem;
FRAG_CB(fp)->next_frag = FRAG_CB(skb)->next_frag;
fp->next = head->next; if (RB_EMPTY_NODE(&skb->rbnode))
if (!fp->next) FRAG_CB(prev_tail)->next_frag = fp;
else
rb_replace_node(&skb->rbnode, &fp->rbnode,
&qp->q.rb_fragments);
if (qp->q.fragments_tail == skb)
qp->q.fragments_tail = fp; qp->q.fragments_tail = fp;
prev->next = fp; skb_morph(skb, head);
FRAG_CB(skb)->next_frag = FRAG_CB(head)->next_frag;
skb_morph(head, qp->q.fragments); rb_replace_node(&head->rbnode, &skb->rbnode,
head->next = qp->q.fragments->next; &qp->q.rb_fragments);
consume_skb(head);
consume_skb(qp->q.fragments); head = skb;
qp->q.fragments = head;
} }
WARN_ON(!head); WARN_ON(head->ip_defrag_offset != 0);
WARN_ON(FRAG_CB(head)->offset != 0);
/* Allocate a new buffer for the datagram. */ /* Allocate a new buffer for the datagram. */
ihlen = ip_hdrlen(head); ihlen = ip_hdrlen(head);
@ -592,35 +574,61 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
clone = alloc_skb(0, GFP_ATOMIC); clone = alloc_skb(0, GFP_ATOMIC);
if (!clone) if (!clone)
goto out_nomem; goto out_nomem;
clone->next = head->next;
head->next = clone;
skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list; skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
skb_frag_list_init(head); skb_frag_list_init(head);
for (i = 0; i < skb_shinfo(head)->nr_frags; i++) for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
plen += skb_frag_size(&skb_shinfo(head)->frags[i]); plen += skb_frag_size(&skb_shinfo(head)->frags[i]);
clone->len = clone->data_len = head->data_len - plen; clone->len = clone->data_len = head->data_len - plen;
head->data_len -= clone->len; head->truesize += clone->truesize;
head->len -= clone->len;
clone->csum = 0; clone->csum = 0;
clone->ip_summed = head->ip_summed; clone->ip_summed = head->ip_summed;
add_frag_mem_limit(qp->q.net, clone->truesize); add_frag_mem_limit(qp->q.net, clone->truesize);
skb_shinfo(head)->frag_list = clone;
nextp = &clone->next;
} else {
nextp = &skb_shinfo(head)->frag_list;
} }
skb_shinfo(head)->frag_list = head->next;
skb_push(head, head->data - skb_network_header(head)); skb_push(head, head->data - skb_network_header(head));
for (fp=head->next; fp; fp = fp->next) { /* Traverse the tree in order, to build frag_list. */
head->data_len += fp->len; fp = FRAG_CB(head)->next_frag;
head->len += fp->len; rbn = rb_next(&head->rbnode);
if (head->ip_summed != fp->ip_summed) rb_erase(&head->rbnode, &qp->q.rb_fragments);
head->ip_summed = CHECKSUM_NONE; while (rbn || fp) {
else if (head->ip_summed == CHECKSUM_COMPLETE) /* fp points to the next sk_buff in the current run;
head->csum = csum_add(head->csum, fp->csum); * rbn points to the next run.
head->truesize += fp->truesize; */
/* Go through the current run. */
while (fp) {
*nextp = fp;
nextp = &fp->next;
fp->prev = NULL;
memset(&fp->rbnode, 0, sizeof(fp->rbnode));
fp->sk = NULL;
head->data_len += fp->len;
head->len += fp->len;
if (head->ip_summed != fp->ip_summed)
head->ip_summed = CHECKSUM_NONE;
else if (head->ip_summed == CHECKSUM_COMPLETE)
head->csum = csum_add(head->csum, fp->csum);
head->truesize += fp->truesize;
fp = FRAG_CB(fp)->next_frag;
}
/* Move to the next run. */
if (rbn) {
struct rb_node *rbnext = rb_next(rbn);
fp = rb_to_skb(rbn);
rb_erase(rbn, &qp->q.rb_fragments);
rbn = rbnext;
}
} }
sub_frag_mem_limit(qp->q.net, head->truesize); sub_frag_mem_limit(qp->q.net, head->truesize);
*nextp = NULL;
head->next = NULL; head->next = NULL;
head->prev = NULL;
head->dev = dev; head->dev = dev;
head->tstamp = qp->q.stamp; head->tstamp = qp->q.stamp;
IPCB(head)->frag_max_size = max(qp->max_df_size, qp->q.max_size); IPCB(head)->frag_max_size = max(qp->max_df_size, qp->q.max_size);
@ -648,7 +656,9 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS); IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS);
qp->q.fragments = NULL; qp->q.fragments = NULL;
qp->q.rb_fragments = RB_ROOT;
qp->q.fragments_tail = NULL; qp->q.fragments_tail = NULL;
qp->q.last_run_head = NULL;
return 0; return 0;
out_nomem: out_nomem:
@ -656,7 +666,7 @@ out_nomem:
err = -ENOMEM; err = -ENOMEM;
goto out_fail; goto out_fail;
out_oversize: out_oversize:
net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->saddr); net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->q.key.v4.saddr);
out_fail: out_fail:
IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS); IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
return err; return err;
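
ip_frag_reasm() above rebuilds the datagram's frag_list in a single pass using nextp, a pointer to the link that will be filled next, so appending a fragment never re-walks the list. A userspace sketch of that tail-pointer idiom, with illustrative types:

/* Sketch of the "nextp" tail-pointer idiom used above to build frag_list. */
#include <stdio.h>
#include <stdlib.h>

struct buf {
	int id;
	struct buf *next;
};

int main(void)
{
	struct buf *list = NULL;
	struct buf **nextp = &list;   /* always points at the link to fill next */
	int i;

	for (i = 0; i < 4; i++) {
		struct buf *b = calloc(1, sizeof(*b));

		b->id = i;
		*nextp = b;           /* append without walking the list */
		nextp = &b->next;
	}
	*nextp = NULL;                /* terminate, as ip_frag_reasm() does */

	for (struct buf *b = list; b; b = b->next)
		printf("buf %d\n", b->id);
	return 0;
}
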
@ -734,25 +744,46 @@ struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb, u32 user)
} }
EXPORT_SYMBOL(ip_check_defrag); EXPORT_SYMBOL(ip_check_defrag);
unsigned int inet_frag_rbtree_purge(struct rb_root *root)
{
struct rb_node *p = rb_first(root);
unsigned int sum = 0;
while (p) {
struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode);
p = rb_next(p);
rb_erase(&skb->rbnode, root);
while (skb) {
struct sk_buff *next = FRAG_CB(skb)->next_frag;
sum += skb->truesize;
kfree_skb(skb);
skb = next;
}
}
return sum;
}
EXPORT_SYMBOL(inet_frag_rbtree_purge);
#ifdef CONFIG_SYSCTL #ifdef CONFIG_SYSCTL
static int zero; static int dist_min;
static struct ctl_table ip4_frags_ns_ctl_table[] = { static struct ctl_table ip4_frags_ns_ctl_table[] = {
{ {
.procname = "ipfrag_high_thresh", .procname = "ipfrag_high_thresh",
.data = &init_net.ipv4.frags.high_thresh, .data = &init_net.ipv4.frags.high_thresh,
.maxlen = sizeof(int), .maxlen = sizeof(unsigned long),
.mode = 0644, .mode = 0644,
.proc_handler = proc_dointvec_minmax, .proc_handler = proc_doulongvec_minmax,
.extra1 = &init_net.ipv4.frags.low_thresh .extra1 = &init_net.ipv4.frags.low_thresh
}, },
{ {
.procname = "ipfrag_low_thresh", .procname = "ipfrag_low_thresh",
.data = &init_net.ipv4.frags.low_thresh, .data = &init_net.ipv4.frags.low_thresh,
.maxlen = sizeof(int), .maxlen = sizeof(unsigned long),
.mode = 0644, .mode = 0644,
.proc_handler = proc_dointvec_minmax, .proc_handler = proc_doulongvec_minmax,
.extra1 = &zero,
.extra2 = &init_net.ipv4.frags.high_thresh .extra2 = &init_net.ipv4.frags.high_thresh
}, },
{ {
@ -781,7 +812,7 @@ static struct ctl_table ip4_frags_ctl_table[] = {
.maxlen = sizeof(int), .maxlen = sizeof(int),
.mode = 0644, .mode = 0644,
.proc_handler = proc_dointvec_minmax, .proc_handler = proc_dointvec_minmax,
.extra1 = &zero .extra1 = &dist_min,
}, },
{ } { }
}; };
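
The table above switches the threshold entries to sizeof(unsigned long) and proc_doulongvec_minmax, so ipfrag_high_thresh and ipfrag_low_thresh are now long values and userspace should not assume they fit in an int. A userspace sketch for reading them, assuming a Linux host that exposes the usual /proc/sys entries:

/* Read the (now long-sized) fragment thresholds from procfs. */
#include <stdio.h>
#include <stdlib.h>

static long read_long_sysctl(const char *path)
{
	char buf[64];
	FILE *f = fopen(path, "r");

	if (!f || !fgets(buf, sizeof(buf), f)) {
		if (f)
			fclose(f);
		return -1;
	}
	fclose(f);
	return strtol(buf, NULL, 10);
}

int main(void)
{
	printf("ipfrag_high_thresh = %ld\n",
	       read_long_sysctl("/proc/sys/net/ipv4/ipfrag_high_thresh"));
	printf("ipfrag_low_thresh  = %ld\n",
	       read_long_sysctl("/proc/sys/net/ipv4/ipfrag_low_thresh"));
	return 0;
}
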
@ -853,6 +884,8 @@ static void __init ip4_frags_ctl_register(void)
static int __net_init ipv4_frags_init_net(struct net *net) static int __net_init ipv4_frags_init_net(struct net *net)
{ {
int res;
/* Fragment cache limits. /* Fragment cache limits.
* *
* The fragment memory accounting code, (tries to) account for * The fragment memory accounting code, (tries to) account for
@ -876,15 +909,21 @@ static int __net_init ipv4_frags_init_net(struct net *net)
*/ */
net->ipv4.frags.timeout = IP_FRAG_TIME; net->ipv4.frags.timeout = IP_FRAG_TIME;
inet_frags_init_net(&net->ipv4.frags); net->ipv4.frags.f = &ip4_frags;
return ip4_frags_ns_ctl_register(net); res = inet_frags_init_net(&net->ipv4.frags);
if (res < 0)
return res;
res = ip4_frags_ns_ctl_register(net);
if (res < 0)
inet_frags_exit_net(&net->ipv4.frags);
return res;
} }
static void __net_exit ipv4_frags_exit_net(struct net *net) static void __net_exit ipv4_frags_exit_net(struct net *net)
{ {
ip4_frags_ns_ctl_unregister(net); ip4_frags_ns_ctl_unregister(net);
inet_frags_exit_net(&net->ipv4.frags, &ip4_frags); inet_frags_exit_net(&net->ipv4.frags);
} }
static struct pernet_operations ip4_frags_ops = { static struct pernet_operations ip4_frags_ops = {
@ -892,18 +931,50 @@ static struct pernet_operations ip4_frags_ops = {
.exit = ipv4_frags_exit_net, .exit = ipv4_frags_exit_net,
}; };
static u32 ip4_key_hashfn(const void *data, u32 len, u32 seed)
{
return jhash2(data,
sizeof(struct frag_v4_compare_key) / sizeof(u32), seed);
}
static u32 ip4_obj_hashfn(const void *data, u32 len, u32 seed)
{
const struct inet_frag_queue *fq = data;
return jhash2((const u32 *)&fq->key.v4,
sizeof(struct frag_v4_compare_key) / sizeof(u32), seed);
}
static int ip4_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr)
{
const struct frag_v4_compare_key *key = arg->key;
const struct inet_frag_queue *fq = ptr;
return !!memcmp(&fq->key, key, sizeof(*key));
}
static const struct rhashtable_params ip4_rhash_params = {
.head_offset = offsetof(struct inet_frag_queue, node),
.key_offset = offsetof(struct inet_frag_queue, key),
.key_len = sizeof(struct frag_v4_compare_key),
.hashfn = ip4_key_hashfn,
.obj_hashfn = ip4_obj_hashfn,
.obj_cmpfn = ip4_obj_cmpfn,
.automatic_shrinking = true,
};
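
ip4_rhash_params hashes the entire frag_v4_compare_key as an array of 32-bit words via jhash2(), which is why the key length must be a multiple of sizeof(u32) and every byte of the key has to be initialized. The userspace sketch below illustrates word-wise hashing of a key struct; the layout and the mixing function are stand-ins (it is not jhash2):

/* Word-wise hashing of a compare key, in the spirit of jhash2(). */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct v4_key_model {                    /* illustrative layout, not the kernel's */
	uint32_t saddr;
	uint32_t daddr;
	uint32_t user;
	uint32_t vif;
	uint16_t id;
	uint16_t protocol;
};

/* Hash 'len' bytes as len/4 32-bit words; the mixer is a toy stand-in, NOT jhash2. */
static uint32_t hash_words(const void *data, size_t len, uint32_t seed)
{
	const unsigned char *p = data;
	uint32_t h = seed;
	size_t i;

	assert(len % sizeof(uint32_t) == 0);   /* jhash2() has the same requirement */
	for (i = 0; i < len; i += sizeof(uint32_t)) {
		uint32_t word;

		memcpy(&word, p + i, sizeof(word));
		h ^= word;
		h *= 0x9e3779b1u;              /* toy mixing constant */
	}
	return h;
}

int main(void)
{
	struct v4_key_model k = {
		.saddr = 0x0a000001, .daddr = 0x0a000002,
		.user = 0, .vif = 0, .id = 0x1234, .protocol = 17,
	};

	printf("hash = %#x\n", (unsigned int)hash_words(&k, sizeof(k), 42));
	return 0;
}
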
void __init ipfrag_init(void) void __init ipfrag_init(void)
{ {
ip4_frags_ctl_register();
register_pernet_subsys(&ip4_frags_ops);
ip4_frags.hashfn = ip4_hashfn;
ip4_frags.constructor = ip4_frag_init; ip4_frags.constructor = ip4_frag_init;
ip4_frags.destructor = ip4_frag_free; ip4_frags.destructor = ip4_frag_free;
ip4_frags.skb_free = NULL; ip4_frags.skb_free = NULL;
ip4_frags.qsize = sizeof(struct ipq); ip4_frags.qsize = sizeof(struct ipq);
ip4_frags.match = ip4_frag_match;
ip4_frags.frag_expire = ip_expire; ip4_frags.frag_expire = ip_expire;
ip4_frags.frags_cache_name = ip_frag_cache_name; ip4_frags.frags_cache_name = ip_frag_cache_name;
ip4_frags.rhash_params = ip4_rhash_params;
if (inet_frags_init(&ip4_frags)) if (inet_frags_init(&ip4_frags))
panic("IP: failed to allocate ip4_frags cache\n"); panic("IP: failed to allocate ip4_frags cache\n");
ip4_frags_ctl_register();
register_pernet_subsys(&ip4_frags_ops);
}

@ -52,7 +52,6 @@
static int sockstat_seq_show(struct seq_file *seq, void *v) static int sockstat_seq_show(struct seq_file *seq, void *v)
{ {
struct net *net = seq->private; struct net *net = seq->private;
unsigned int frag_mem;
int orphans, sockets; int orphans, sockets;
local_bh_disable(); local_bh_disable();
@ -72,8 +71,9 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
sock_prot_inuse_get(net, &udplite_prot)); sock_prot_inuse_get(net, &udplite_prot));
seq_printf(seq, "RAW: inuse %d\n", seq_printf(seq, "RAW: inuse %d\n",
sock_prot_inuse_get(net, &raw_prot)); sock_prot_inuse_get(net, &raw_prot));
frag_mem = ip_frag_mem(net); seq_printf(seq, "FRAG: inuse %u memory %lu\n",
seq_printf(seq, "FRAG: inuse %u memory %u\n", !!frag_mem, frag_mem); atomic_read(&net->ipv4.frags.rhashtable.nelems),
frag_mem_limit(&net->ipv4.frags));
return 0; return 0;
} }
@ -132,6 +132,7 @@ static const struct snmp_mib snmp4_ipextstats_list[] = {
SNMP_MIB_ITEM("InECT1Pkts", IPSTATS_MIB_ECT1PKTS), SNMP_MIB_ITEM("InECT1Pkts", IPSTATS_MIB_ECT1PKTS),
SNMP_MIB_ITEM("InECT0Pkts", IPSTATS_MIB_ECT0PKTS), SNMP_MIB_ITEM("InECT0Pkts", IPSTATS_MIB_ECT0PKTS),
SNMP_MIB_ITEM("InCEPkts", IPSTATS_MIB_CEPKTS), SNMP_MIB_ITEM("InCEPkts", IPSTATS_MIB_CEPKTS),
SNMP_MIB_ITEM("ReasmOverlaps", IPSTATS_MIB_REASM_OVERLAPS),
SNMP_MIB_SENTINEL SNMP_MIB_SENTINEL
}; };

@ -64,7 +64,6 @@ struct nf_ct_frag6_skb_cb
static struct inet_frags nf_frags; static struct inet_frags nf_frags;
#ifdef CONFIG_SYSCTL #ifdef CONFIG_SYSCTL
static int zero;
static struct ctl_table nf_ct_frag6_sysctl_table[] = { static struct ctl_table nf_ct_frag6_sysctl_table[] = {
{ {
@ -77,18 +76,17 @@ static struct ctl_table nf_ct_frag6_sysctl_table[] = {
{ {
.procname = "nf_conntrack_frag6_low_thresh", .procname = "nf_conntrack_frag6_low_thresh",
.data = &init_net.nf_frag.frags.low_thresh, .data = &init_net.nf_frag.frags.low_thresh,
.maxlen = sizeof(unsigned int), .maxlen = sizeof(unsigned long),
.mode = 0644, .mode = 0644,
.proc_handler = proc_dointvec_minmax, .proc_handler = proc_doulongvec_minmax,
.extra1 = &zero,
.extra2 = &init_net.nf_frag.frags.high_thresh .extra2 = &init_net.nf_frag.frags.high_thresh
}, },
{ {
.procname = "nf_conntrack_frag6_high_thresh", .procname = "nf_conntrack_frag6_high_thresh",
.data = &init_net.nf_frag.frags.high_thresh, .data = &init_net.nf_frag.frags.high_thresh,
.maxlen = sizeof(unsigned int), .maxlen = sizeof(unsigned long),
.mode = 0644, .mode = 0644,
.proc_handler = proc_dointvec_minmax, .proc_handler = proc_doulongvec_minmax,
.extra1 = &init_net.nf_frag.frags.low_thresh .extra1 = &init_net.nf_frag.frags.low_thresh
}, },
{ } { }
@ -153,23 +151,6 @@ static inline u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h)
return 1 << (ipv6_get_dsfield(ipv6h) & INET_ECN_MASK); return 1 << (ipv6_get_dsfield(ipv6h) & INET_ECN_MASK);
} }
static unsigned int nf_hash_frag(__be32 id, const struct in6_addr *saddr,
const struct in6_addr *daddr)
{
net_get_random_once(&nf_frags.rnd, sizeof(nf_frags.rnd));
return jhash_3words(ipv6_addr_hash(saddr), ipv6_addr_hash(daddr),
(__force u32)id, nf_frags.rnd);
}
static unsigned int nf_hashfn(const struct inet_frag_queue *q)
{
const struct frag_queue *nq;
nq = container_of(q, struct frag_queue, q);
return nf_hash_frag(nq->id, &nq->saddr, &nq->daddr);
}
static void nf_skb_free(struct sk_buff *skb) static void nf_skb_free(struct sk_buff *skb)
{ {
if (NFCT_FRAG6_CB(skb)->orig) if (NFCT_FRAG6_CB(skb)->orig)
@ -184,34 +165,26 @@ static void nf_ct_frag6_expire(unsigned long data)
fq = container_of((struct inet_frag_queue *)data, struct frag_queue, q); fq = container_of((struct inet_frag_queue *)data, struct frag_queue, q);
net = container_of(fq->q.net, struct net, nf_frag.frags); net = container_of(fq->q.net, struct net, nf_frag.frags);
ip6_expire_frag_queue(net, fq, &nf_frags); ip6_expire_frag_queue(net, fq);
} }
/* Creation primitives. */ /* Creation primitives. */
static inline struct frag_queue *fq_find(struct net *net, __be32 id, static struct frag_queue *fq_find(struct net *net, __be32 id, u32 user,
u32 user, struct in6_addr *src, const struct ipv6hdr *hdr, int iif)
struct in6_addr *dst, int iif, u8 ecn)
{ {
struct frag_v6_compare_key key = {
.id = id,
.saddr = hdr->saddr,
.daddr = hdr->daddr,
.user = user,
.iif = iif,
};
struct inet_frag_queue *q; struct inet_frag_queue *q;
struct ip6_create_arg arg;
unsigned int hash;
arg.id = id; q = inet_frag_find(&net->nf_frag.frags, &key);
arg.user = user; if (!q)
arg.src = src;
arg.dst = dst;
arg.iif = iif;
arg.ecn = ecn;
local_bh_disable();
hash = nf_hash_frag(id, src, dst);
q = inet_frag_find(&net->nf_frag.frags, &nf_frags, &arg, hash);
local_bh_enable();
if (IS_ERR_OR_NULL(q)) {
inet_frag_maybe_warn_overflow(q, pr_fmt());
return NULL; return NULL;
}
return container_of(q, struct frag_queue, q); return container_of(q, struct frag_queue, q);
} }
@ -362,7 +335,7 @@ found:
return 0; return 0;
discard_fq: discard_fq:
inet_frag_kill(&fq->q, &nf_frags); inet_frag_kill(&fq->q);
err: err:
return -1; return -1;
} }
@ -383,7 +356,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct net_device *dev)
int payload_len; int payload_len;
u8 ecn; u8 ecn;
inet_frag_kill(&fq->q, &nf_frags); inet_frag_kill(&fq->q);
WARN_ON(head == NULL); WARN_ON(head == NULL);
WARN_ON(NFCT_FRAG6_CB(head)->offset != 0); WARN_ON(NFCT_FRAG6_CB(head)->offset != 0);
@ -454,6 +427,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct net_device *dev)
else if (head->ip_summed == CHECKSUM_COMPLETE) else if (head->ip_summed == CHECKSUM_COMPLETE)
head->csum = csum_add(head->csum, fp->csum); head->csum = csum_add(head->csum, fp->csum);
head->truesize += fp->truesize; head->truesize += fp->truesize;
fp->sk = NULL;
} }
sub_frag_mem_limit(fq->q.net, head->truesize); sub_frag_mem_limit(fq->q.net, head->truesize);
@ -472,6 +446,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct net_device *dev)
head->csum); head->csum);
fq->q.fragments = NULL; fq->q.fragments = NULL;
fq->q.rb_fragments = RB_ROOT;
fq->q.fragments_tail = NULL; fq->q.fragments_tail = NULL;
/* all original skbs are linked into the NFCT_FRAG6_CB(head).orig */ /* all original skbs are linked into the NFCT_FRAG6_CB(head).orig */
@ -601,9 +576,13 @@ struct sk_buff *nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 use
hdr = ipv6_hdr(clone); hdr = ipv6_hdr(clone);
fhdr = (struct frag_hdr *)skb_transport_header(clone); fhdr = (struct frag_hdr *)skb_transport_header(clone);
if (clone->len - skb_network_offset(clone) < IPV6_MIN_MTU &&
fhdr->frag_off & htons(IP6_MF))
goto ret_orig;
skb_orphan(skb); skb_orphan(skb);
fq = fq_find(net, fhdr->identification, user, &hdr->saddr, &hdr->daddr, fq = fq_find(net, fhdr->identification, user, hdr,
skb->dev ? skb->dev->ifindex : 0, ip6_frag_ecn(hdr)); skb->dev ? skb->dev->ifindex : 0);
if (fq == NULL) { if (fq == NULL) {
pr_debug("Can't find and can't create new queue\n"); pr_debug("Can't find and can't create new queue\n");
goto ret_orig; goto ret_orig;
@ -614,7 +593,7 @@ struct sk_buff *nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 use
if (nf_ct_frag6_queue(fq, clone, fhdr, nhoff) < 0) { if (nf_ct_frag6_queue(fq, clone, fhdr, nhoff) < 0) {
spin_unlock_bh(&fq->q.lock); spin_unlock_bh(&fq->q.lock);
pr_debug("Can't insert skb to queue\n"); pr_debug("Can't insert skb to queue\n");
inet_frag_put(&fq->q, &nf_frags); inet_frag_put(&fq->q);
goto ret_orig; goto ret_orig;
} }
@ -626,7 +605,7 @@ struct sk_buff *nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 use
} }
spin_unlock_bh(&fq->q.lock); spin_unlock_bh(&fq->q.lock);
inet_frag_put(&fq->q, &nf_frags); inet_frag_put(&fq->q);
return ret_skb; return ret_skb;
ret_orig: ret_orig:
@ -650,18 +629,26 @@ EXPORT_SYMBOL_GPL(nf_ct_frag6_consume_orig);
static int nf_ct_net_init(struct net *net) static int nf_ct_net_init(struct net *net)
{ {
int res;
net->nf_frag.frags.high_thresh = IPV6_FRAG_HIGH_THRESH; net->nf_frag.frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
net->nf_frag.frags.low_thresh = IPV6_FRAG_LOW_THRESH; net->nf_frag.frags.low_thresh = IPV6_FRAG_LOW_THRESH;
net->nf_frag.frags.timeout = IPV6_FRAG_TIMEOUT; net->nf_frag.frags.timeout = IPV6_FRAG_TIMEOUT;
inet_frags_init_net(&net->nf_frag.frags); net->nf_frag.frags.f = &nf_frags;
return nf_ct_frag6_sysctl_register(net); res = inet_frags_init_net(&net->nf_frag.frags);
if (res < 0)
return res;
res = nf_ct_frag6_sysctl_register(net);
if (res < 0)
inet_frags_exit_net(&net->nf_frag.frags);
return res;
} }
static void nf_ct_net_exit(struct net *net) static void nf_ct_net_exit(struct net *net)
{ {
nf_ct_frags6_sysctl_unregister(net); nf_ct_frags6_sysctl_unregister(net);
inet_frags_exit_net(&net->nf_frag.frags, &nf_frags); inet_frags_exit_net(&net->nf_frag.frags);
} }
static struct pernet_operations nf_ct_net_ops = { static struct pernet_operations nf_ct_net_ops = {
@ -673,14 +660,13 @@ int nf_ct_frag6_init(void)
{ {
int ret = 0; int ret = 0;
nf_frags.hashfn = nf_hashfn;
nf_frags.constructor = ip6_frag_init; nf_frags.constructor = ip6_frag_init;
nf_frags.destructor = NULL; nf_frags.destructor = NULL;
nf_frags.skb_free = nf_skb_free; nf_frags.skb_free = nf_skb_free;
nf_frags.qsize = sizeof(struct frag_queue); nf_frags.qsize = sizeof(struct frag_queue);
nf_frags.match = ip6_frag_match;
nf_frags.frag_expire = nf_ct_frag6_expire; nf_frags.frag_expire = nf_ct_frag6_expire;
nf_frags.frags_cache_name = nf_frags_cache_name; nf_frags.frags_cache_name = nf_frags_cache_name;
nf_frags.rhash_params = ip6_rhash_params;
ret = inet_frags_init(&nf_frags); ret = inet_frags_init(&nf_frags);
if (ret) if (ret)
goto out; goto out;

@ -33,7 +33,6 @@
static int sockstat6_seq_show(struct seq_file *seq, void *v) static int sockstat6_seq_show(struct seq_file *seq, void *v)
{ {
struct net *net = seq->private; struct net *net = seq->private;
unsigned int frag_mem = ip6_frag_mem(net);
seq_printf(seq, "TCP6: inuse %d\n", seq_printf(seq, "TCP6: inuse %d\n",
sock_prot_inuse_get(net, &tcpv6_prot)); sock_prot_inuse_get(net, &tcpv6_prot));
@ -43,7 +42,9 @@ static int sockstat6_seq_show(struct seq_file *seq, void *v)
sock_prot_inuse_get(net, &udplitev6_prot)); sock_prot_inuse_get(net, &udplitev6_prot));
seq_printf(seq, "RAW6: inuse %d\n", seq_printf(seq, "RAW6: inuse %d\n",
sock_prot_inuse_get(net, &rawv6_prot)); sock_prot_inuse_get(net, &rawv6_prot));
seq_printf(seq, "FRAG6: inuse %u memory %u\n", !!frag_mem, frag_mem); seq_printf(seq, "FRAG6: inuse %u memory %lu\n",
atomic_read(&net->ipv6.frags.rhashtable.nelems),
frag_mem_limit(&net->ipv6.frags));
return 0; return 0;
} }

@ -79,94 +79,58 @@ static struct inet_frags ip6_frags;
static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
struct net_device *dev); struct net_device *dev);
/*
* callers should be careful not to use the hash value outside the ipfrag_lock
* as doing so could race with ipfrag_hash_rnd being recalculated.
*/
static unsigned int inet6_hash_frag(__be32 id, const struct in6_addr *saddr,
const struct in6_addr *daddr)
{
net_get_random_once(&ip6_frags.rnd, sizeof(ip6_frags.rnd));
return jhash_3words(ipv6_addr_hash(saddr), ipv6_addr_hash(daddr),
(__force u32)id, ip6_frags.rnd);
}
static unsigned int ip6_hashfn(const struct inet_frag_queue *q)
{
const struct frag_queue *fq;
fq = container_of(q, struct frag_queue, q);
return inet6_hash_frag(fq->id, &fq->saddr, &fq->daddr);
}
bool ip6_frag_match(const struct inet_frag_queue *q, const void *a)
{
const struct frag_queue *fq;
const struct ip6_create_arg *arg = a;
fq = container_of(q, struct frag_queue, q);
return fq->id == arg->id &&
fq->user == arg->user &&
ipv6_addr_equal(&fq->saddr, arg->src) &&
ipv6_addr_equal(&fq->daddr, arg->dst) &&
(arg->iif == fq->iif ||
!(ipv6_addr_type(arg->dst) & (IPV6_ADDR_MULTICAST |
IPV6_ADDR_LINKLOCAL)));
}
EXPORT_SYMBOL(ip6_frag_match);
void ip6_frag_init(struct inet_frag_queue *q, const void *a) void ip6_frag_init(struct inet_frag_queue *q, const void *a)
{ {
struct frag_queue *fq = container_of(q, struct frag_queue, q); struct frag_queue *fq = container_of(q, struct frag_queue, q);
const struct ip6_create_arg *arg = a; const struct frag_v6_compare_key *key = a;
fq->id = arg->id; q->key.v6 = *key;
fq->user = arg->user; fq->ecn = 0;
fq->saddr = *arg->src;
fq->daddr = *arg->dst;
fq->ecn = arg->ecn;
} }
EXPORT_SYMBOL(ip6_frag_init); EXPORT_SYMBOL(ip6_frag_init);
void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq, void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq)
struct inet_frags *frags)
{ {
struct net_device *dev = NULL; struct net_device *dev = NULL;
struct sk_buff *head;
rcu_read_lock();
spin_lock(&fq->q.lock); spin_lock(&fq->q.lock);
if (fq->q.flags & INET_FRAG_COMPLETE) if (fq->q.flags & INET_FRAG_COMPLETE)
goto out; goto out;
inet_frag_kill(&fq->q, frags); inet_frag_kill(&fq->q);
rcu_read_lock();
dev = dev_get_by_index_rcu(net, fq->iif); dev = dev_get_by_index_rcu(net, fq->iif);
if (!dev) if (!dev)
goto out_rcu_unlock; goto out;
IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS); IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS);
if (inet_frag_evicting(&fq->q))
goto out_rcu_unlock;
IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT); IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT);
/* Don't send error if the first segment did not arrive. */ /* Don't send error if the first segment did not arrive. */
if (!(fq->q.flags & INET_FRAG_FIRST_IN) || !fq->q.fragments) head = fq->q.fragments;
goto out_rcu_unlock; if (!(fq->q.flags & INET_FRAG_FIRST_IN) || !head)
goto out;
/* But use as source device on which LAST ARRIVED /* But use as source device on which LAST ARRIVED
* segment was received. And do not use fq->dev * segment was received. And do not use fq->dev
* pointer directly, device might already disappeared. * pointer directly, device might already disappeared.
*/ */
fq->q.fragments->dev = dev; head->dev = dev;
icmpv6_send(fq->q.fragments, ICMPV6_TIME_EXCEED, ICMPV6_EXC_FRAGTIME, 0); skb_get(head);
out_rcu_unlock: spin_unlock(&fq->q.lock);
rcu_read_unlock();
icmpv6_send(head, ICMPV6_TIME_EXCEED, ICMPV6_EXC_FRAGTIME, 0);
kfree_skb(head);
goto out_rcu_unlock;
out: out:
spin_unlock(&fq->q.lock); spin_unlock(&fq->q.lock);
inet_frag_put(&fq->q, frags); out_rcu_unlock:
rcu_read_unlock();
inet_frag_put(&fq->q);
} }
EXPORT_SYMBOL(ip6_expire_frag_queue); EXPORT_SYMBOL(ip6_expire_frag_queue);
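
The reworked ip6_expire_frag_queue() above pins the head skb with skb_get() while the queue lock is held, drops the lock, sends the ICMPv6 time-exceeded message, and only then releases its reference. A userspace sketch of that "take a reference under the lock, do the slow work outside it" pattern, with purely illustrative names:

/* Pin an object under a lock, then use it after dropping the lock. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct obj {
	atomic_int refcnt;
	int payload;
};

static struct obj *obj_get(struct obj *o)
{
	atomic_fetch_add(&o->refcnt, 1);
	return o;
}

static void obj_put(struct obj *o)
{
	if (atomic_fetch_sub(&o->refcnt, 1) == 1)
		free(o);
}

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct obj *shared;                 /* protected by 'lock' */

static void slow_send(struct obj *o)       /* stand-in for icmpv6_send() */
{
	printf("sending report for payload %d\n", o->payload);
}

int main(void)
{
	struct obj *o;

	shared = malloc(sizeof(*shared));
	atomic_init(&shared->refcnt, 1);
	shared->payload = 42;

	pthread_mutex_lock(&lock);
	o = obj_get(shared);               /* pin it while the lock is held    */
	pthread_mutex_unlock(&lock);       /* don't hold the lock for the send */

	slow_send(o);
	obj_put(o);                        /* drop our temporary reference     */

	pthread_mutex_lock(&lock);
	obj_put(shared);                   /* drop the owner's reference       */
	shared = NULL;
	pthread_mutex_unlock(&lock);
	return 0;
}
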
@ -178,31 +142,29 @@ static void ip6_frag_expire(unsigned long data)
fq = container_of((struct inet_frag_queue *)data, struct frag_queue, q); fq = container_of((struct inet_frag_queue *)data, struct frag_queue, q);
net = container_of(fq->q.net, struct net, ipv6.frags); net = container_of(fq->q.net, struct net, ipv6.frags);
ip6_expire_frag_queue(net, fq, &ip6_frags); ip6_expire_frag_queue(net, fq);
} }
static struct frag_queue * static struct frag_queue *
fq_find(struct net *net, __be32 id, const struct in6_addr *src, fq_find(struct net *net, __be32 id, const struct ipv6hdr *hdr, int iif)
const struct in6_addr *dst, int iif, u8 ecn)
{ {
struct frag_v6_compare_key key = {
.id = id,
.saddr = hdr->saddr,
.daddr = hdr->daddr,
.user = IP6_DEFRAG_LOCAL_DELIVER,
.iif = iif,
};
struct inet_frag_queue *q; struct inet_frag_queue *q;
struct ip6_create_arg arg;
unsigned int hash;
arg.id = id; if (!(ipv6_addr_type(&hdr->daddr) & (IPV6_ADDR_MULTICAST |
arg.user = IP6_DEFRAG_LOCAL_DELIVER; IPV6_ADDR_LINKLOCAL)))
arg.src = src; key.iif = 0;
arg.dst = dst;
arg.iif = iif;
arg.ecn = ecn;
hash = inet6_hash_frag(id, src, dst); q = inet_frag_find(&net->ipv6.frags, &key);
if (!q)
q = inet_frag_find(&net->ipv6.frags, &ip6_frags, &arg, hash);
if (IS_ERR_OR_NULL(q)) {
inet_frag_maybe_warn_overflow(q, pr_fmt());
return NULL; return NULL;
}
return container_of(q, struct frag_queue, q); return container_of(q, struct frag_queue, q);
} }
@ -359,7 +321,7 @@ found:
return -1; return -1;
discard_fq: discard_fq:
inet_frag_kill(&fq->q, &ip6_frags); inet_frag_kill(&fq->q);
err: err:
IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)),
IPSTATS_MIB_REASMFAILS); IPSTATS_MIB_REASMFAILS);
@ -386,7 +348,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
int sum_truesize; int sum_truesize;
u8 ecn; u8 ecn;
inet_frag_kill(&fq->q, &ip6_frags); inet_frag_kill(&fq->q);
ecn = ip_frag_ecn_table[fq->ecn]; ecn = ip_frag_ecn_table[fq->ecn];
if (unlikely(ecn == 0xff)) if (unlikely(ecn == 0xff))
@ -503,6 +465,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMOKS); IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMOKS);
rcu_read_unlock(); rcu_read_unlock();
fq->q.fragments = NULL; fq->q.fragments = NULL;
fq->q.rb_fragments = RB_ROOT;
fq->q.fragments_tail = NULL; fq->q.fragments_tail = NULL;
return 1; return 1;
@ -524,6 +487,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb)
struct frag_queue *fq; struct frag_queue *fq;
const struct ipv6hdr *hdr = ipv6_hdr(skb); const struct ipv6hdr *hdr = ipv6_hdr(skb);
struct net *net = dev_net(skb_dst(skb)->dev); struct net *net = dev_net(skb_dst(skb)->dev);
int iif;
if (IP6CB(skb)->flags & IP6SKB_FRAGMENTED) if (IP6CB(skb)->flags & IP6SKB_FRAGMENTED)
goto fail_hdr; goto fail_hdr;
@ -552,17 +516,22 @@ static int ipv6_frag_rcv(struct sk_buff *skb)
return 1; return 1;
} }
fq = fq_find(net, fhdr->identification, &hdr->saddr, &hdr->daddr, if (skb->len - skb_network_offset(skb) < IPV6_MIN_MTU &&
skb->dev ? skb->dev->ifindex : 0, ip6_frag_ecn(hdr)); fhdr->frag_off & htons(IP6_MF))
goto fail_hdr;
iif = skb->dev ? skb->dev->ifindex : 0;
fq = fq_find(net, fhdr->identification, hdr, iif);
if (fq) { if (fq) {
int ret; int ret;
spin_lock(&fq->q.lock); spin_lock(&fq->q.lock);
fq->iif = iif;
ret = ip6_frag_queue(fq, skb, fhdr, IP6CB(skb)->nhoff); ret = ip6_frag_queue(fq, skb, fhdr, IP6CB(skb)->nhoff);
spin_unlock(&fq->q.lock); spin_unlock(&fq->q.lock);
inet_frag_put(&fq->q, &ip6_frags); inet_frag_put(&fq->q);
return ret; return ret;
} }
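
ipv6_frag_rcv() above (and nf_ct_frag6_gather() earlier) now rejects any fragment that has the more-fragments bit set but carries fewer than IPV6_MIN_MTU bytes counted from the IPv6 header, i.e. skb->len - skb_network_offset(skb). A sketch of that predicate, using host-order values for simplicity:

/* "Non-last fragment smaller than the IPv6 minimum MTU" check. */
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

#define IPV6_MIN_MTU 1280
#define IP6_MF       0x0001          /* more-fragments bit, host order here */

/* pkt_len: bytes from the start of the IPv6 header to the end of the packet. */
static bool drop_undersized_nonlast(unsigned int pkt_len, unsigned int frag_off)
{
	return pkt_len < IPV6_MIN_MTU && (frag_off & IP6_MF);
}

int main(void)
{
	assert(drop_undersized_nonlast(600, IP6_MF));      /* tiny, MF set: drop */
	assert(!drop_undersized_nonlast(600, 0));          /* last frag: keep    */
	assert(!drop_undersized_nonlast(1280, IP6_MF));    /* big enough: keep   */
	printf("checks passed\n");
	return 0;
}
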
@ -583,24 +552,22 @@ static const struct inet6_protocol frag_protocol = {
}; };
#ifdef CONFIG_SYSCTL #ifdef CONFIG_SYSCTL
static int zero;
static struct ctl_table ip6_frags_ns_ctl_table[] = { static struct ctl_table ip6_frags_ns_ctl_table[] = {
{ {
.procname = "ip6frag_high_thresh", .procname = "ip6frag_high_thresh",
.data = &init_net.ipv6.frags.high_thresh, .data = &init_net.ipv6.frags.high_thresh,
.maxlen = sizeof(int), .maxlen = sizeof(unsigned long),
.mode = 0644, .mode = 0644,
.proc_handler = proc_dointvec_minmax, .proc_handler = proc_doulongvec_minmax,
.extra1 = &init_net.ipv6.frags.low_thresh .extra1 = &init_net.ipv6.frags.low_thresh
}, },
{ {
.procname = "ip6frag_low_thresh", .procname = "ip6frag_low_thresh",
.data = &init_net.ipv6.frags.low_thresh, .data = &init_net.ipv6.frags.low_thresh,
.maxlen = sizeof(int), .maxlen = sizeof(unsigned long),
.mode = 0644, .mode = 0644,
.proc_handler = proc_dointvec_minmax, .proc_handler = proc_doulongvec_minmax,
.extra1 = &zero,
.extra2 = &init_net.ipv6.frags.high_thresh .extra2 = &init_net.ipv6.frags.high_thresh
}, },
{ {
@ -708,19 +675,27 @@ static void ip6_frags_sysctl_unregister(void)
static int __net_init ipv6_frags_init_net(struct net *net) static int __net_init ipv6_frags_init_net(struct net *net)
{ {
int res;
net->ipv6.frags.high_thresh = IPV6_FRAG_HIGH_THRESH; net->ipv6.frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
net->ipv6.frags.low_thresh = IPV6_FRAG_LOW_THRESH; net->ipv6.frags.low_thresh = IPV6_FRAG_LOW_THRESH;
net->ipv6.frags.timeout = IPV6_FRAG_TIMEOUT; net->ipv6.frags.timeout = IPV6_FRAG_TIMEOUT;
net->ipv6.frags.f = &ip6_frags;
inet_frags_init_net(&net->ipv6.frags); res = inet_frags_init_net(&net->ipv6.frags);
if (res < 0)
return res;
return ip6_frags_ns_sysctl_register(net); res = ip6_frags_ns_sysctl_register(net);
if (res < 0)
inet_frags_exit_net(&net->ipv6.frags);
return res;
} }
static void __net_exit ipv6_frags_exit_net(struct net *net) static void __net_exit ipv6_frags_exit_net(struct net *net)
{ {
ip6_frags_ns_sysctl_unregister(net); ip6_frags_ns_sysctl_unregister(net);
inet_frags_exit_net(&net->ipv6.frags, &ip6_frags); inet_frags_exit_net(&net->ipv6.frags);
} }
static struct pernet_operations ip6_frags_ops = { static struct pernet_operations ip6_frags_ops = {
@ -728,14 +703,55 @@ static struct pernet_operations ip6_frags_ops = {
.exit = ipv6_frags_exit_net, .exit = ipv6_frags_exit_net,
}; };
static u32 ip6_key_hashfn(const void *data, u32 len, u32 seed)
{
return jhash2(data,
sizeof(struct frag_v6_compare_key) / sizeof(u32), seed);
}
static u32 ip6_obj_hashfn(const void *data, u32 len, u32 seed)
{
const struct inet_frag_queue *fq = data;
return jhash2((const u32 *)&fq->key.v6,
sizeof(struct frag_v6_compare_key) / sizeof(u32), seed);
}
static int ip6_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr)
{
const struct frag_v6_compare_key *key = arg->key;
const struct inet_frag_queue *fq = ptr;
return !!memcmp(&fq->key, key, sizeof(*key));
}
const struct rhashtable_params ip6_rhash_params = {
.head_offset = offsetof(struct inet_frag_queue, node),
.hashfn = ip6_key_hashfn,
.obj_hashfn = ip6_obj_hashfn,
.obj_cmpfn = ip6_obj_cmpfn,
.automatic_shrinking = true,
};
EXPORT_SYMBOL(ip6_rhash_params);
int __init ipv6_frag_init(void) int __init ipv6_frag_init(void)
{ {
int ret; int ret;
ret = inet6_add_protocol(&frag_protocol, IPPROTO_FRAGMENT); ip6_frags.constructor = ip6_frag_init;
ip6_frags.destructor = NULL;
ip6_frags.qsize = sizeof(struct frag_queue);
ip6_frags.frag_expire = ip6_frag_expire;
ip6_frags.frags_cache_name = ip6_frag_cache_name;
ip6_frags.rhash_params = ip6_rhash_params;
ret = inet_frags_init(&ip6_frags);
if (ret) if (ret)
goto out; goto out;
ret = inet6_add_protocol(&frag_protocol, IPPROTO_FRAGMENT);
if (ret)
goto err_protocol;
ret = ip6_frags_sysctl_register(); ret = ip6_frags_sysctl_register();
if (ret) if (ret)
goto err_sysctl; goto err_sysctl;
@ -744,17 +760,6 @@ int __init ipv6_frag_init(void)
if (ret) if (ret)
goto err_pernet; goto err_pernet;
ip6_frags.hashfn = ip6_hashfn;
ip6_frags.constructor = ip6_frag_init;
ip6_frags.destructor = NULL;
ip6_frags.skb_free = NULL;
ip6_frags.qsize = sizeof(struct frag_queue);
ip6_frags.match = ip6_frag_match;
ip6_frags.frag_expire = ip6_frag_expire;
ip6_frags.frags_cache_name = ip6_frag_cache_name;
ret = inet_frags_init(&ip6_frags);
if (ret)
goto err_pernet;
out: out:
return ret; return ret;
@ -762,6 +767,8 @@ err_pernet:
ip6_frags_sysctl_unregister(); ip6_frags_sysctl_unregister();
err_sysctl: err_sysctl:
inet6_del_protocol(&frag_protocol, IPPROTO_FRAGMENT); inet6_del_protocol(&frag_protocol, IPPROTO_FRAGMENT);
err_protocol:
inet_frags_fini(&ip6_frags);
goto out; goto out;
} }
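
ipv6_frag_init() now sets up ip6_frags before registering the protocol handler, and each failure path unwinds only the steps that already succeeded, in reverse order (err_pernet, err_sysctl, then the new err_protocol calling inet_frags_fini). A generic userspace sketch of that goto-unwind idiom, with illustrative step names:

/* Initialise in order; on failure, tear down what succeeded, in reverse. */
#include <stdio.h>

static int step_a(void) { puts("a up");   return 0; }
static int step_b(void) { puts("b up");   return 0; }
static int step_c(void) { puts("c fail"); return -1; }   /* simulate failure */

static void undo_a(void) { puts("a down"); }
static void undo_b(void) { puts("b down"); }

static int subsystem_init(void)
{
	int ret;

	ret = step_a();
	if (ret)
		goto out;
	ret = step_b();
	if (ret)
		goto err_a;
	ret = step_c();
	if (ret)
		goto err_b;
	return 0;

err_b:
	undo_b();        /* reverse order: undo b before a */
err_a:
	undo_a();
out:
	return ret;
}

int main(void)
{
	return subsystem_init() ? 1 : 0;
}
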