netfilter: conntrack: remove central spinlock nf_conntrack_lock
nf_conntrack_lock is a monolithic lock and suffers from huge contention
on current generation servers (8 or more core/threads).

Perf locking congestion is clear on the base kernel:

-  72.56%  ksoftirqd/6  [kernel.kallsyms]  [k] _raw_spin_lock_bh
   - _raw_spin_lock_bh
      + 25.33% init_conntrack
      + 24.86% nf_ct_delete_from_lists
      + 24.62% __nf_conntrack_confirm
      + 24.38% destroy_conntrack
      +  0.70% tcp_packet
+   2.21%  ksoftirqd/6  [kernel.kallsyms]  [k] fib_table_lookup
+   1.15%  ksoftirqd/6  [kernel.kallsyms]  [k] __slab_free
+   0.77%  ksoftirqd/6  [kernel.kallsyms]  [k] inet_getpeer
+   0.70%  ksoftirqd/6  [nf_conntrack]     [k] nf_ct_delete
+   0.55%  ksoftirqd/6  [ip_tables]        [k] ipt_do_table

This patch changes conntrack locking and provides a huge performance
improvement.  SYN-flood attack tested on a 24-core E5-2695v2(ES) with
10Gbit/s ixgbe (with tool trafgen):

 Base kernel:   810.405 new conntrack/sec
 After patch: 2.233.876 new conntrack/sec

Note that other flood attacks (SYN+ACK or ACK) can easily be deflected using:
 # iptables -A INPUT -m state --state INVALID -j DROP
 # sysctl -w net/netfilter/nf_conntrack_tcp_loose=0

Use an array of hashed spinlocks to protect insertions/deletions of
conntracks into the hash table.  1024 spinlocks seem to give good
results, at minimal cost (4KB memory).  Due to lockdep max depth,
1024 becomes 8 if CONFIG_LOCKDEP=y.

The hash resize is a bit tricky, because we need to take all locks in
the array.  A seqcount_t is used to synchronize the hash table users
with the resizing process.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Reviewed-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
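The locking scheme described above can be summarized with a small standalone sketch. This is a minimal userspace model, not the kernel code: pthread mutexes stand in for the per-bucket spinlocks, a plain atomic counter stands in for the seqcount_t, and the names HASHLOCKS, table_gen, pair_lock(), pair_unlock() and hash_of() are illustrative only and do not appear in the patch.

/* Minimal userspace sketch of the hashed-lock + generation-count scheme.
 * Assumption: the (not shown) resize path bumps table_gen while holding
 * all HASHLOCKS locks, so a stale generation value means "hashes were
 * computed against the old table size".
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

#define HASHLOCKS 1024			/* the patch uses CONNTRACK_LOCKS */

static pthread_mutex_t locks[HASHLOCKS];
static atomic_uint table_gen;		/* bumped around every resize */

static void locks_init(void)
{
	for (int i = 0; i < HASHLOCKS; i++)
		pthread_mutex_init(&locks[i], NULL);
}

static void pair_unlock(unsigned int h1, unsigned int h2)
{
	h1 %= HASHLOCKS;
	h2 %= HASHLOCKS;
	pthread_mutex_unlock(&locks[h1]);
	if (h1 != h2)
		pthread_mutex_unlock(&locks[h2]);
}

/* Take the two bucket locks in a fixed order (lowest index first) so two
 * threads locking the same pair can never deadlock.  Return true if a
 * resize ran in between, meaning the caller must recompute its hashes. */
static bool pair_lock(unsigned int h1, unsigned int h2, unsigned int seq)
{
	h1 %= HASHLOCKS;
	h2 %= HASHLOCKS;
	if (h1 <= h2) {
		pthread_mutex_lock(&locks[h1]);
		if (h1 != h2)
			pthread_mutex_lock(&locks[h2]);
	} else {
		pthread_mutex_lock(&locks[h2]);
		pthread_mutex_lock(&locks[h1]);
	}
	if (atomic_load(&table_gen) != seq) {	/* table resized meanwhile */
		pair_unlock(h1, h2);
		return true;
	}
	return false;
}

/* Caller pattern, mirroring the do/while loops the patch adds:
 *
 *	unsigned int seq, h1, h2;
 *	do {
 *		seq = atomic_load(&table_gen);
 *		h1 = hash_of(original_tuple);	// table size may have changed,
 *		h2 = hash_of(reply_tuple);	// so recompute inside the loop
 *	} while (pair_lock(h1, h2, seq));
 *	... add or remove the entry in both buckets ...
 *	pair_unlock(h1, h2);
 */

Locking the lower-indexed bucket first is what keeps the pairwise locking deadlock-free, and the generation check closes the window where the hashes were computed against the old table just before a resize swapped it.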
Parent: ca7433df3a
Commit: 93bb0ceb75
--- a/include/net/netfilter/nf_conntrack_core.h
+++ b/include/net/netfilter/nf_conntrack_core.h
@@ -77,7 +77,12 @@ print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple,
 		    const struct nf_conntrack_l3proto *l3proto,
 		    const struct nf_conntrack_l4proto *proto);
 
-extern spinlock_t nf_conntrack_lock ;
+#ifdef CONFIG_LOCKDEP
+# define CONNTRACK_LOCKS 8
+#else
+# define CONNTRACK_LOCKS 1024
+#endif
+extern spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS];
 
 extern spinlock_t nf_conntrack_expect_lock;
 
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -5,6 +5,7 @@
 #include <linux/list_nulls.h>
 #include <linux/atomic.h>
 #include <linux/netfilter/nf_conntrack_tcp.h>
+#include <linux/seqlock.h>
 
 struct ctl_table_header;
 struct nf_conntrack_ecache;
@@ -90,6 +91,7 @@ struct netns_ct {
 	int			sysctl_checksum;
 
 	unsigned int		htable_size;
+	seqcount_t		generation;
 	struct kmem_cache	*nf_conntrack_cachep;
 	struct hlist_nulls_head	*hash;
 	struct hlist_head	*expect_hash;
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -60,12 +60,60 @@ int (*nfnetlink_parse_nat_setup_hook)(struct nf_conn *ct,
 			      const struct nlattr *attr) __read_mostly;
 EXPORT_SYMBOL_GPL(nfnetlink_parse_nat_setup_hook);
 
-DEFINE_SPINLOCK(nf_conntrack_lock);
-EXPORT_SYMBOL_GPL(nf_conntrack_lock);
+__cacheline_aligned_in_smp spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS];
+EXPORT_SYMBOL_GPL(nf_conntrack_locks);
 
 __cacheline_aligned_in_smp DEFINE_SPINLOCK(nf_conntrack_expect_lock);
 EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock);
 
+static void nf_conntrack_double_unlock(unsigned int h1, unsigned int h2)
+{
+	h1 %= CONNTRACK_LOCKS;
+	h2 %= CONNTRACK_LOCKS;
+	spin_unlock(&nf_conntrack_locks[h1]);
+	if (h1 != h2)
+		spin_unlock(&nf_conntrack_locks[h2]);
+}
+
+/* return true if we need to recompute hashes (in case hash table was resized) */
+static bool nf_conntrack_double_lock(struct net *net, unsigned int h1,
+				     unsigned int h2, unsigned int sequence)
+{
+	h1 %= CONNTRACK_LOCKS;
+	h2 %= CONNTRACK_LOCKS;
+	if (h1 <= h2) {
+		spin_lock(&nf_conntrack_locks[h1]);
+		if (h1 != h2)
+			spin_lock_nested(&nf_conntrack_locks[h2],
+					 SINGLE_DEPTH_NESTING);
+	} else {
+		spin_lock(&nf_conntrack_locks[h2]);
+		spin_lock_nested(&nf_conntrack_locks[h1],
+				 SINGLE_DEPTH_NESTING);
+	}
+	if (read_seqcount_retry(&net->ct.generation, sequence)) {
+		nf_conntrack_double_unlock(h1, h2);
+		return true;
+	}
+	return false;
+}
+
+static void nf_conntrack_all_lock(void)
+{
+	int i;
+
+	for (i = 0; i < CONNTRACK_LOCKS; i++)
+		spin_lock_nested(&nf_conntrack_locks[i], i);
+}
+
+static void nf_conntrack_all_unlock(void)
+{
+	int i;
+
+	for (i = 0; i < CONNTRACK_LOCKS; i++)
+		spin_unlock(&nf_conntrack_locks[i]);
+}
+
 unsigned int nf_conntrack_htable_size __read_mostly;
 EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);
 
@@ -280,15 +328,28 @@ destroy_conntrack(struct nf_conntrack *nfct)
 static void nf_ct_delete_from_lists(struct nf_conn *ct)
 {
 	struct net *net = nf_ct_net(ct);
+	unsigned int hash, reply_hash;
+	u16 zone = nf_ct_zone(ct);
+	unsigned int sequence;
 
 	nf_ct_helper_destroy(ct);
-	spin_lock_bh(&nf_conntrack_lock);
-	/* Inside lock so preempt is disabled on module removal path.
-	 * Otherwise we can get spurious warnings. */
-	NF_CT_STAT_INC(net, delete_list);
+
+	local_bh_disable();
+	do {
+		sequence = read_seqcount_begin(&net->ct.generation);
+		hash = hash_conntrack(net, zone,
+				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+		reply_hash = hash_conntrack(net, zone,
+					    &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
+
 	clean_from_lists(ct);
+	nf_conntrack_double_unlock(hash, reply_hash);
+
 	nf_ct_add_to_dying_list(ct);
-	spin_unlock_bh(&nf_conntrack_lock);
+
+	NF_CT_STAT_INC(net, delete_list);
+	local_bh_enable();
 }
 
 static void death_by_event(unsigned long ul_conntrack)
@@ -372,8 +433,6 @@ nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
  * Warning :
  * - Caller must take a reference on returned object
  *   and recheck nf_ct_tuple_equal(tuple, &h->tuple)
- * OR
- * - Caller must lock nf_conntrack_lock before calling this function
  */
 static struct nf_conntrack_tuple_hash *
 ____nf_conntrack_find(struct net *net, u16 zone,
@@ -467,14 +526,18 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct)
 	struct nf_conntrack_tuple_hash *h;
 	struct hlist_nulls_node *n;
 	u16 zone;
+	unsigned int sequence;
 
 	zone = nf_ct_zone(ct);
-	hash = hash_conntrack(net, zone,
-			      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
-	reply_hash = hash_conntrack(net, zone,
-				   &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
 
-	spin_lock_bh(&nf_conntrack_lock);
+	local_bh_disable();
+	do {
+		sequence = read_seqcount_begin(&net->ct.generation);
+		hash = hash_conntrack(net, zone,
+				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+		reply_hash = hash_conntrack(net, zone,
+					    &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
 
 	/* See if there's one in the list already, including reverse */
 	hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode)
@@ -493,14 +556,15 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct)
 	/* The caller holds a reference to this object */
 	atomic_set(&ct->ct_general.use, 2);
 	__nf_conntrack_hash_insert(ct, hash, reply_hash);
+	nf_conntrack_double_unlock(hash, reply_hash);
 	NF_CT_STAT_INC(net, insert);
-	spin_unlock_bh(&nf_conntrack_lock);
-
+	local_bh_enable();
 	return 0;
 
 out:
+	nf_conntrack_double_unlock(hash, reply_hash);
 	NF_CT_STAT_INC(net, insert_failed);
-	spin_unlock_bh(&nf_conntrack_lock);
+	local_bh_enable();
 	return -EEXIST;
 }
 EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert);
@@ -540,6 +604,7 @@ __nf_conntrack_confirm(struct sk_buff *skb)
 	enum ip_conntrack_info ctinfo;
 	struct net *net;
 	u16 zone;
+	unsigned int sequence;
 
 	ct = nf_ct_get(skb, &ctinfo);
 	net = nf_ct_net(ct);
@@ -552,31 +617,37 @@ __nf_conntrack_confirm(struct sk_buff *skb)
 		return NF_ACCEPT;
 
 	zone = nf_ct_zone(ct);
-	/* reuse the hash saved before */
-	hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev;
-	hash = hash_bucket(hash, net);
-	reply_hash = hash_conntrack(net, zone,
-				   &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+	local_bh_disable();
+
+	do {
+		sequence = read_seqcount_begin(&net->ct.generation);
+		/* reuse the hash saved before */
+		hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev;
+		hash = hash_bucket(hash, net);
+		reply_hash = hash_conntrack(net, zone,
+					    &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+
+	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
 
 	/* We're not in hash table, and we refuse to set up related
-	   connections for unconfirmed conns.  But packet copies and
-	   REJECT will give spurious warnings here. */
+	 * connections for unconfirmed conns.  But packet copies and
+	 * REJECT will give spurious warnings here.
+	 */
 	/* NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
 
 	/* No external references means no one else could have
-	   confirmed us. */
+	 * confirmed us.
+	 */
 	NF_CT_ASSERT(!nf_ct_is_confirmed(ct));
 	pr_debug("Confirming conntrack %p\n", ct);
-
-	spin_lock_bh(&nf_conntrack_lock);
-
 	/* We have to check the DYING flag inside the lock to prevent
 	   a race against nf_ct_get_next_corpse() possibly called from
 	   user context, else we insert an already 'dead' hash, blocking
 	   further use of that particular connection -JM */
 
 	if (unlikely(nf_ct_is_dying(ct))) {
-		spin_unlock_bh(&nf_conntrack_lock);
+		nf_conntrack_double_unlock(hash, reply_hash);
+		local_bh_enable();
 		return NF_ACCEPT;
 	}
 
@@ -618,8 +689,9 @@ __nf_conntrack_confirm(struct sk_buff *skb)
 	 * stores are visible.
 	 */
 	__nf_conntrack_hash_insert(ct, hash, reply_hash);
+	nf_conntrack_double_unlock(hash, reply_hash);
 	NF_CT_STAT_INC(net, insert);
-	spin_unlock_bh(&nf_conntrack_lock);
+	local_bh_enable();
 
 	help = nfct_help(ct);
 	if (help && help->helper)
@@ -630,8 +702,9 @@ __nf_conntrack_confirm(struct sk_buff *skb)
 	return NF_ACCEPT;
 
 out:
+	nf_conntrack_double_unlock(hash, reply_hash);
 	NF_CT_STAT_INC(net, insert_failed);
-	spin_unlock_bh(&nf_conntrack_lock);
+	local_bh_enable();
 	return NF_DROP;
 }
 EXPORT_SYMBOL_GPL(__nf_conntrack_confirm);
@@ -674,39 +747,48 @@ EXPORT_SYMBOL_GPL(nf_conntrack_tuple_taken);
 
 /* There's a small race here where we may free a just-assured
    connection.  Too bad: we're in trouble anyway. */
-static noinline int early_drop(struct net *net, unsigned int hash)
+static noinline int early_drop(struct net *net, unsigned int _hash)
 {
 	/* Use oldest entry, which is roughly LRU */
 	struct nf_conntrack_tuple_hash *h;
 	struct nf_conn *ct = NULL, *tmp;
 	struct hlist_nulls_node *n;
-	unsigned int i, cnt = 0;
+	unsigned int i = 0, cnt = 0;
 	int dropped = 0;
+	unsigned int hash, sequence;
+	spinlock_t *lockp;
 
-	rcu_read_lock();
-	for (i = 0; i < net->ct.htable_size; i++) {
+	local_bh_disable();
+restart:
+	sequence = read_seqcount_begin(&net->ct.generation);
+	hash = hash_bucket(_hash, net);
+	for (; i < net->ct.htable_size; i++) {
+		lockp = &nf_conntrack_locks[hash % CONNTRACK_LOCKS];
+		spin_lock(lockp);
+		if (read_seqcount_retry(&net->ct.generation, sequence)) {
+			spin_unlock(lockp);
+			goto restart;
+		}
 		hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash],
 					       hnnode) {
 			tmp = nf_ct_tuplehash_to_ctrack(h);
-			if (!test_bit(IPS_ASSURED_BIT, &tmp->status))
+			if (!test_bit(IPS_ASSURED_BIT, &tmp->status) &&
+			    !nf_ct_is_dying(tmp) &&
+			    atomic_inc_not_zero(&tmp->ct_general.use)) {
 				ct = tmp;
+				break;
+			}
 			cnt++;
 		}
 
-		if (ct != NULL) {
-			if (likely(!nf_ct_is_dying(ct) &&
-				   atomic_inc_not_zero(&ct->ct_general.use)))
-				break;
-			else
-				ct = NULL;
-		}
+		hash = (hash + 1) % net->ct.htable_size;
+		spin_unlock(lockp);
 
-		if (cnt >= NF_CT_EVICTION_RANGE)
+		if (ct || cnt >= NF_CT_EVICTION_RANGE)
 			break;
 
-		hash = (hash + 1) % net->ct.htable_size;
 	}
-	rcu_read_unlock();
+	local_bh_enable();
 
 	if (!ct)
 		return dropped;
@@ -755,7 +837,7 @@ __nf_conntrack_alloc(struct net *net, u16 zone,
 
 	if (nf_conntrack_max &&
 	    unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) {
-		if (!early_drop(net, hash_bucket(hash, net))) {
+		if (!early_drop(net, hash)) {
 			atomic_dec(&net->ct.count);
 			net_warn_ratelimited("nf_conntrack: table full, dropping packet\n");
 			return ERR_PTR(-ENOMEM);
@@ -1304,18 +1386,24 @@ get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data),
 	struct nf_conn *ct;
 	struct hlist_nulls_node *n;
 	int cpu;
+	spinlock_t *lockp;
 
-	spin_lock_bh(&nf_conntrack_lock);
 	for (; *bucket < net->ct.htable_size; (*bucket)++) {
-		hlist_nulls_for_each_entry(h, n, &net->ct.hash[*bucket], hnnode) {
-			if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
-				continue;
-			ct = nf_ct_tuplehash_to_ctrack(h);
-			if (iter(ct, data))
-				goto found;
+		lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS];
+		local_bh_disable();
+		spin_lock(lockp);
+		if (*bucket < net->ct.htable_size) {
+			hlist_nulls_for_each_entry(h, n, &net->ct.hash[*bucket], hnnode) {
+				if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
+					continue;
+				ct = nf_ct_tuplehash_to_ctrack(h);
+				if (iter(ct, data))
+					goto found;
+			}
 		}
+		spin_unlock(lockp);
+		local_bh_enable();
 	}
-	spin_unlock_bh(&nf_conntrack_lock);
 
 	for_each_possible_cpu(cpu) {
 		struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
@@ -1331,7 +1419,8 @@ get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data),
 	return NULL;
 found:
 	atomic_inc(&ct->ct_general.use);
-	spin_unlock_bh(&nf_conntrack_lock);
+	spin_unlock(lockp);
+	local_bh_enable();
 	return ct;
 }
 
@@ -1532,12 +1621,16 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
 	if (!hash)
 		return -ENOMEM;
 
+	local_bh_disable();
+	nf_conntrack_all_lock();
+	write_seqcount_begin(&init_net.ct.generation);
+
 	/* Lookups in the old hash might happen in parallel, which means we
 	 * might get false negatives during connection lookup. New connections
 	 * created because of a false negative won't make it into the hash
-	 * though since that required taking the lock.
+	 * though since that required taking the locks.
 	 */
-	spin_lock_bh(&nf_conntrack_lock);
+
 	for (i = 0; i < init_net.ct.htable_size; i++) {
 		while (!hlist_nulls_empty(&init_net.ct.hash[i])) {
 			h = hlist_nulls_entry(init_net.ct.hash[i].first,
@@ -1554,7 +1647,10 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
 
 	init_net.ct.htable_size = nf_conntrack_htable_size = hashsize;
 	init_net.ct.hash = hash;
-	spin_unlock_bh(&nf_conntrack_lock);
+
+	write_seqcount_end(&init_net.ct.generation);
+	nf_conntrack_all_unlock();
+	local_bh_enable();
 
 	nf_ct_free_hashtable(old_hash, old_size);
 	return 0;
@@ -1576,7 +1672,10 @@ EXPORT_SYMBOL_GPL(nf_ct_untracked_status_or);
 int nf_conntrack_init_start(void)
 {
 	int max_factor = 8;
-	int ret, cpu;
+	int i, ret, cpu;
 
+	for (i = 0; i < ARRAY_SIZE(nf_conntrack_locks); i++)
+		spin_lock_init(&nf_conntrack_locks[i]);
+
 	/* Idea from tcp.c: use 1/16384 of memory.  On i386: 32MB
 	 * machine has 512 buckets. >= 1GB machines have 16384 buckets. */
--- a/net/netfilter/nf_conntrack_helper.c
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -423,12 +423,16 @@ static void __nf_conntrack_helper_unregister(struct nf_conntrack_helper *me,
 			unhelp(h, me);
 		spin_unlock_bh(&pcpu->lock);
 	}
-	spin_lock_bh(&nf_conntrack_lock);
+	local_bh_disable();
 	for (i = 0; i < net->ct.htable_size; i++) {
-		hlist_nulls_for_each_entry(h, nn, &net->ct.hash[i], hnnode)
-			unhelp(h, me);
+		spin_lock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]);
+		if (i < net->ct.htable_size) {
+			hlist_nulls_for_each_entry(h, nn, &net->ct.hash[i], hnnode)
+				unhelp(h, me);
+		}
+		spin_unlock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]);
 	}
-	spin_unlock_bh(&nf_conntrack_lock);
+	local_bh_enable();
 }
 
 void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me)
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -764,14 +764,23 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
 	struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
 	u_int8_t l3proto = nfmsg->nfgen_family;
 	int res;
+	spinlock_t *lockp;
+
 #ifdef CONFIG_NF_CONNTRACK_MARK
 	const struct ctnetlink_dump_filter *filter = cb->data;
 #endif
 
-	spin_lock_bh(&nf_conntrack_lock);
 	last = (struct nf_conn *)cb->args[1];
+
+	local_bh_disable();
 	for (; cb->args[0] < net->ct.htable_size; cb->args[0]++) {
 restart:
+		lockp = &nf_conntrack_locks[cb->args[0] % CONNTRACK_LOCKS];
+		spin_lock(lockp);
+		if (cb->args[0] >= net->ct.htable_size) {
+			spin_unlock(lockp);
+			goto out;
+		}
 		hlist_nulls_for_each_entry(h, n, &net->ct.hash[cb->args[0]],
 					   hnnode) {
 			if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
@@ -803,16 +812,18 @@ restart:
 			if (res < 0) {
 				nf_conntrack_get(&ct->ct_general);
 				cb->args[1] = (unsigned long)ct;
+				spin_unlock(lockp);
 				goto out;
 			}
 		}
+		spin_unlock(lockp);
 		if (cb->args[1]) {
 			cb->args[1] = 0;
 			goto restart;
 		}
 	}
 out:
-	spin_unlock_bh(&nf_conntrack_lock);
+	local_bh_enable();
 	if (last)
 		nf_ct_put(last);
 