2005-04-17 02:20:36 +04:00
|
|
|
/* flow.c: Generic flow cache.
|
|
|
|
*
|
|
|
|
* Copyright (C) 2003 Alexey N. Kuznetsov (kuznet@ms2.inr.ac.ru)
|
|
|
|
* Copyright (C) 2003 David S. Miller (davem@redhat.com)
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <linux/kernel.h>
|
|
|
|
#include <linux/module.h>
|
|
|
|
#include <linux/list.h>
|
|
|
|
#include <linux/jhash.h>
|
|
|
|
#include <linux/interrupt.h>
|
|
|
|
#include <linux/mm.h>
|
|
|
|
#include <linux/random.h>
|
|
|
|
#include <linux/init.h>
|
|
|
|
#include <linux/slab.h>
|
|
|
|
#include <linux/smp.h>
|
|
|
|
#include <linux/completion.h>
|
|
|
|
#include <linux/percpu.h>
|
|
|
|
#include <linux/bitops.h>
|
|
|
|
#include <linux/notifier.h>
|
|
|
|
#include <linux/cpu.h>
|
|
|
|
#include <linux/cpumask.h>
|
2006-03-21 09:33:17 +03:00
|
|
|
#include <linux/mutex.h>
|
2005-04-17 02:20:36 +04:00
|
|
|
#include <net/flow.h>
|
2011-07-27 03:09:06 +04:00
|
|
|
#include <linux/atomic.h>
|
[LSM-IPSec]: Security association restriction.
This patch series implements per packet access control via the
extension of the Linux Security Modules (LSM) interface by hooks in
the XFRM and pfkey subsystems that leverage IPSec security
associations to label packets. Extensions to the SELinux LSM are
included that leverage the patch for this purpose.
This patch implements the changes necessary to the XFRM subsystem,
pfkey interface, ipv4/ipv6, and xfrm_user interface to restrict a
socket to use only authorized security associations (or no security
association) to send/receive network packets.
Patch purpose:
The patch is designed to enable access control per packets based on
the strongly authenticated IPSec security association. Such access
controls augment the existing ones based on network interface and IP
address. The former are very coarse-grained, and the latter can be
spoofed. By using IPSec, the system can control access to remote
hosts based on cryptographic keys generated using the IPSec mechanism.
This enables access control on a per-machine basis or per-application
if the remote machine is running the same mechanism and trusted to
enforce the access control policy.
Patch design approach:
The overall approach is that policy (xfrm_policy) entries set by
user-level programs (e.g., setkey for ipsec-tools) are extended with a
security context that is used at policy selection time in the XFRM
subsystem to restrict the sockets that can send/receive packets via
security associations (xfrm_states) that are built from those
policies.
A presentation available at
www.selinux-symposium.org/2005/presentations/session2/2-3-jaeger.pdf
from the SELinux symposium describes the overall approach.
Patch implementation details:
On output, the policy retrieved (via xfrm_policy_lookup or
xfrm_sk_policy_lookup) must be authorized for the security context of
the socket and the same security context is required for resultant
security association (retrieved or negotiated via racoon in
ipsec-tools). This is enforced in xfrm_state_find.
On input, the policy retrieved must also be authorized for the socket
(at __xfrm_policy_check), and the security context of the policy must
also match the security association being used.
The patch has virtually no impact on packets that do not use IPSec.
The existing Netfilter (outgoing) and LSM rcv_skb hooks are used as
before.
Also, if IPSec is used without security contexts, the impact is
minimal. The LSM must allow such policies to be selected for the
combination of socket and remote machine, but subsequent IPSec
processing proceeds as in the original case.
Testing:
The pfkey interface is tested using the ipsec-tools. ipsec-tools have
been modified (a separate ipsec-tools patch is available for version
0.5) that supports assignment of xfrm_policy entries and security
associations with security contexts via setkey and the negotiation
using the security contexts via racoon.
The xfrm_user interface is tested via ad hoc programs that set
security contexts. These programs are also available from me, and
contain programs for setting, getting, and deleting policy for testing
this interface. Testing of sa functions was done by tracing kernel
behavior.
Signed-off-by: Trent Jaeger <tjaeger@cse.psu.edu>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2005-12-14 10:12:27 +03:00
|
|
|
#include <linux/security.h>
|
2014-01-18 05:55:27 +04:00
|
|
|
#include <net/net_namespace.h>
|
2005-04-17 02:20:36 +04:00
|
|
|
|
|
|
|
struct flow_cache_entry {
|
2010-04-07 04:30:07 +04:00
|
|
|
union {
|
|
|
|
struct hlist_node hlist;
|
|
|
|
struct list_head gc_list;
|
|
|
|
} u;
|
2011-08-31 10:05:27 +04:00
|
|
|
struct net *net;
|
2010-04-07 04:30:04 +04:00
|
|
|
u16 family;
|
|
|
|
u8 dir;
|
|
|
|
u32 genid;
|
|
|
|
struct flowi key;
|
|
|
|
struct flow_cache_object *object;
|
2005-04-17 02:20:36 +04:00
|
|
|
};
|
|
|
|
|
|
|
|
struct flow_flush_info {
|
2010-04-07 04:30:04 +04:00
|
|
|
struct flow_cache *cache;
|
2010-03-31 04:17:06 +04:00
|
|
|
atomic_t cpuleft;
|
|
|
|
struct completion completion;
|
2005-04-17 02:20:36 +04:00
|
|
|
};
|
|
|
|
|
2014-03-10 18:09:07 +04:00
|
|
|
/* Slab cache that struct flow_cache_entry objects are allocated from and freed to. */
static struct kmem_cache *flow_cachep __read_mostly;
|
|
|
|
|
2010-03-31 04:17:06 +04:00
|
|
|
/* Number of buckets in each per-CPU hash table: two to the power hash_shift. */
#define flow_cache_hash_size(cache) (1 << (cache)->hash_shift)
|
|
|
|
/* Delay, in jiffies, added when the hash re-key timer is re-armed (10 * 60 seconds). */
#define FLOW_HASH_RND_PERIOD (10 * 60 * HZ)
|
2005-04-17 02:20:36 +04:00
|
|
|
|
|
|
|
static void flow_cache_new_hashrnd(unsigned long arg)
|
|
|
|
{
|
2010-03-31 04:17:06 +04:00
|
|
|
struct flow_cache *fc = (void *) arg;
|
2005-04-17 02:20:36 +04:00
|
|
|
int i;
|
|
|
|
|
2006-04-11 09:52:50 +04:00
|
|
|
for_each_possible_cpu(i)
|
2010-03-31 04:17:06 +04:00
|
|
|
per_cpu_ptr(fc->percpu, i)->hash_rnd_recalc = 1;
|
2005-04-17 02:20:36 +04:00
|
|
|
|
2010-03-31 04:17:06 +04:00
|
|
|
fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
|
|
|
|
add_timer(&fc->rnd_timer);
|
2005-04-17 02:20:36 +04:00
|
|
|
}
|
|
|
|
|
2014-01-18 05:55:27 +04:00
|
|
|
static int flow_entry_valid(struct flow_cache_entry *fle,
|
|
|
|
struct netns_xfrm *xfrm)
|
2010-04-07 04:30:04 +04:00
|
|
|
{
|
2014-01-18 05:55:27 +04:00
|
|
|
if (atomic_read(&xfrm->flow_cache_genid) != fle->genid)
|
2010-04-07 04:30:04 +04:00
|
|
|
return 0;
|
|
|
|
if (fle->object && !fle->object->ops->check(fle->object))
|
|
|
|
return 0;
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
2014-01-18 05:55:27 +04:00
|
|
|
static void flow_entry_kill(struct flow_cache_entry *fle,
|
|
|
|
struct netns_xfrm *xfrm)
|
IPsec: propagate security module errors up from flow_cache_lookup
When a security module is loaded (in this case, SELinux), the
security_xfrm_policy_lookup() hook can return an access denied permission
(or other error). We were not handling that correctly, and in fact
inverting the return logic and propagating a false "ok" back up to
xfrm_lookup(), which then allowed packets to pass as if they were not
associated with an xfrm policy.
The way I was seeing the problem was when connecting via IPsec to a
confined service on an SELinux box (vsftpd), which did not have the
appropriate SELinux policy permissions to send packets via IPsec.
The first SYNACK would be blocked, because of an uncached lookup via
flow_cache_lookup(), which would fail to resolve an xfrm policy because
the SELinux policy is checked at that point via the resolver.
However, retransmitted SYNACKs would then find a cached flow entry when
calling into flow_cache_lookup() with a null xfrm policy, which is
interpreted by xfrm_lookup() as the packet not having any associated
policy and similarly to the first case, allowing it to pass without
transformation.
The solution presented here is to first ensure that errno values are
correctly propagated all the way back up through the various call chains
from security_xfrm_policy_lookup(), and handled correctly.
Then, flow_cache_lookup() is modified, so that if the policy resolver
fails (typically a permission denied via the security module), the flow
cache entry is killed rather than having a null policy assigned (which
indicates that the packet can pass freely). This also forces any future
lookups for the same flow to consult the security module (e.g. SELinux)
for current security policy (rather than, say, caching the error on the
flow cache entry).
Signed-off-by: James Morris <jmorris@namei.org>
2006-10-06 00:42:27 +04:00
|
|
|
{
|
|
|
|
if (fle->object)
|
2010-04-07 04:30:04 +04:00
|
|
|
fle->object->ops->delete(fle->object);
|
2014-03-10 18:09:07 +04:00
|
|
|
kmem_cache_free(flow_cachep, fle);
|
2010-04-07 04:30:07 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
static void flow_cache_gc_task(struct work_struct *work)
|
|
|
|
{
|
|
|
|
struct list_head gc_list;
|
|
|
|
struct flow_cache_entry *fce, *n;
|
2014-01-18 05:55:27 +04:00
|
|
|
struct netns_xfrm *xfrm = container_of(work, struct netns_xfrm,
|
|
|
|
flow_cache_gc_work);
|
2010-04-07 04:30:07 +04:00
|
|
|
|
|
|
|
INIT_LIST_HEAD(&gc_list);
|
2014-01-18 05:55:27 +04:00
|
|
|
spin_lock_bh(&xfrm->flow_cache_gc_lock);
|
|
|
|
list_splice_tail_init(&xfrm->flow_cache_gc_list, &gc_list);
|
|
|
|
spin_unlock_bh(&xfrm->flow_cache_gc_lock);
|
2010-04-07 04:30:07 +04:00
|
|
|
|
2016-02-22 12:40:07 +03:00
|
|
|
list_for_each_entry_safe(fce, n, &gc_list, u.gc_list) {
|
2014-01-18 05:55:27 +04:00
|
|
|
flow_entry_kill(fce, xfrm);
|
2016-02-22 12:40:07 +03:00
|
|
|
atomic_dec(&xfrm->flow_cache_gc_count);
|
|
|
|
}
|
2010-04-07 04:30:07 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
static void flow_cache_queue_garbage(struct flow_cache_percpu *fcp,
|
2014-01-18 05:55:27 +04:00
|
|
|
int deleted, struct list_head *gc_list,
|
|
|
|
struct netns_xfrm *xfrm)
|
2010-04-07 04:30:07 +04:00
|
|
|
{
|
|
|
|
if (deleted) {
|
2016-02-22 12:40:07 +03:00
|
|
|
atomic_add(deleted, &xfrm->flow_cache_gc_count);
|
2010-04-07 04:30:07 +04:00
|
|
|
fcp->hash_count -= deleted;
|
2014-01-18 05:55:27 +04:00
|
|
|
spin_lock_bh(&xfrm->flow_cache_gc_lock);
|
|
|
|
list_splice_tail(gc_list, &xfrm->flow_cache_gc_list);
|
|
|
|
spin_unlock_bh(&xfrm->flow_cache_gc_lock);
|
|
|
|
schedule_work(&xfrm->flow_cache_gc_work);
|
2010-04-07 04:30:07 +04:00
|
|
|
}
|
IPsec: propagate security module errors up from flow_cache_lookup
When a security module is loaded (in this case, SELinux), the
security_xfrm_policy_lookup() hook can return an access denied permission
(or other error). We were not handling that correctly, and in fact
inverting the return logic and propagating a false "ok" back up to
xfrm_lookup(), which then allowed packets to pass as if they were not
associated with an xfrm policy.
The way I was seeing the problem was when connecting via IPsec to a
confined service on an SELinux box (vsftpd), which did not have the
appropriate SELinux policy permissions to send packets via IPsec.
The first SYNACK would be blocked, because of an uncached lookup via
flow_cache_lookup(), which would fail to resolve an xfrm policy because
the SELinux policy is checked at that point via the resolver.
However, retransmitted SYNACKs would then find a cached flow entry when
calling into flow_cache_lookup() with a null xfrm policy, which is
interpreted by xfrm_lookup() as the packet not having any associated
policy and similarly to the first case, allowing it to pass without
transformation.
The solution presented here is to first ensure that errno values are
correctly propagated all the way back up through the various call chains
from security_xfrm_policy_lookup(), and handled correctly.
Then, flow_cache_lookup() is modified, so that if the policy resolver
fails (typically a permission denied via the security module), the flow
cache entry is killed rather than having a null policy assigned (which
indicates that the packet can pass freely). This also forces any future
lookups for the same flow to consult the security module (e.g. SELinux)
for current security policy (rather than, say, caching the error on the
flow cache entry).
Signed-off-by: James Morris <jmorris@namei.org>
2006-10-06 00:42:27 +04:00
|
|
|
}
|
|
|
|
|
2010-03-31 04:17:06 +04:00
|
|
|
static void __flow_cache_shrink(struct flow_cache *fc,
|
|
|
|
struct flow_cache_percpu *fcp,
|
|
|
|
int shrink_to)
|
2005-04-17 02:20:36 +04:00
|
|
|
{
|
2010-04-07 04:30:07 +04:00
|
|
|
struct flow_cache_entry *fle;
|
hlist: drop the node parameter from iterators
I'm not sure why, but the hlist for each entry iterators were conceived
list_for_each_entry(pos, head, member)
The hlist ones were greedy and wanted an extra parameter:
hlist_for_each_entry(tpos, pos, head, member)
Why did they need an extra pos parameter? I'm not quite sure. Not only
they don't really need it, it also prevents the iterator from looking
exactly like the list iterator, which is unfortunate.
Besides the semantic patch, there was some manual work required:
- Fix up the actual hlist iterators in linux/list.h
- Fix up the declaration of other iterators based on the hlist ones.
- A very small amount of places were using the 'node' parameter, this
was modified to use 'obj->member' instead.
- Coccinelle didn't handle the hlist_for_each_entry_safe iterator
properly, so those had to be fixed up manually.
The semantic patch which is mostly the work of Peter Senna Tschudin is here:
@@
iterator name hlist_for_each_entry, hlist_for_each_entry_continue, hlist_for_each_entry_from, hlist_for_each_entry_rcu, hlist_for_each_entry_rcu_bh, hlist_for_each_entry_continue_rcu_bh, for_each_busy_worker, ax25_uid_for_each, ax25_for_each, inet_bind_bucket_for_each, sctp_for_each_hentry, sk_for_each, sk_for_each_rcu, sk_for_each_from, sk_for_each_safe, sk_for_each_bound, hlist_for_each_entry_safe, hlist_for_each_entry_continue_rcu, nr_neigh_for_each, nr_neigh_for_each_safe, nr_node_for_each, nr_node_for_each_safe, for_each_gfn_indirect_valid_sp, for_each_gfn_sp, for_each_host;
type T;
expression a,c,d,e;
identifier b;
statement S;
@@
-T b;
<+... when != b
(
hlist_for_each_entry(a,
- b,
c, d) S
|
hlist_for_each_entry_continue(a,
- b,
c) S
|
hlist_for_each_entry_from(a,
- b,
c) S
|
hlist_for_each_entry_rcu(a,
- b,
c, d) S
|
hlist_for_each_entry_rcu_bh(a,
- b,
c, d) S
|
hlist_for_each_entry_continue_rcu_bh(a,
- b,
c) S
|
for_each_busy_worker(a, c,
- b,
d) S
|
ax25_uid_for_each(a,
- b,
c) S
|
ax25_for_each(a,
- b,
c) S
|
inet_bind_bucket_for_each(a,
- b,
c) S
|
sctp_for_each_hentry(a,
- b,
c) S
|
sk_for_each(a,
- b,
c) S
|
sk_for_each_rcu(a,
- b,
c) S
|
sk_for_each_from
-(a, b)
+(a)
S
+ sk_for_each_from(a) S
|
sk_for_each_safe(a,
- b,
c, d) S
|
sk_for_each_bound(a,
- b,
c) S
|
hlist_for_each_entry_safe(a,
- b,
c, d, e) S
|
hlist_for_each_entry_continue_rcu(a,
- b,
c) S
|
nr_neigh_for_each(a,
- b,
c) S
|
nr_neigh_for_each_safe(a,
- b,
c, d) S
|
nr_node_for_each(a,
- b,
c) S
|
nr_node_for_each_safe(a,
- b,
c, d) S
|
- for_each_gfn_sp(a, c, d, b) S
+ for_each_gfn_sp(a, c, d) S
|
- for_each_gfn_indirect_valid_sp(a, c, d, b) S
+ for_each_gfn_indirect_valid_sp(a, c, d) S
|
for_each_host(a,
- b,
c) S
|
for_each_host_safe(a,
- b,
c, d) S
|
for_each_mesh_entry(a,
- b,
c, d) S
)
...+>
[akpm@linux-foundation.org: drop bogus change from net/ipv4/raw.c]
[akpm@linux-foundation.org: drop bogus hunk from net/ipv6/raw.c]
[akpm@linux-foundation.org: checkpatch fixes]
[akpm@linux-foundation.org: fix warnings]
[akpm@linux-foudnation.org: redo intrusive kvm changes]
Tested-by: Peter Senna Tschudin <peter.senna@gmail.com>
Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Sasha Levin <sasha.levin@oracle.com>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-02-28 05:06:00 +04:00
|
|
|
struct hlist_node *tmp;
|
2010-04-07 04:30:07 +04:00
|
|
|
LIST_HEAD(gc_list);
|
|
|
|
int i, deleted = 0;
|
2014-01-18 05:55:27 +04:00
|
|
|
struct netns_xfrm *xfrm = container_of(fc, struct netns_xfrm,
|
|
|
|
flow_cache_global);
|
2005-04-17 02:20:36 +04:00
|
|
|
|
2010-03-31 04:17:06 +04:00
|
|
|
for (i = 0; i < flow_cache_hash_size(fc); i++) {
|
2010-04-07 04:30:04 +04:00
|
|
|
int saved = 0;
|
2005-04-17 02:20:36 +04:00
|
|
|
|
hlist: drop the node parameter from iterators
I'm not sure why, but the hlist for each entry iterators were conceived
list_for_each_entry(pos, head, member)
The hlist ones were greedy and wanted an extra parameter:
hlist_for_each_entry(tpos, pos, head, member)
Why did they need an extra pos parameter? I'm not quite sure. Not only
they don't really need it, it also prevents the iterator from looking
exactly like the list iterator, which is unfortunate.
Besides the semantic patch, there was some manual work required:
- Fix up the actual hlist iterators in linux/list.h
- Fix up the declaration of other iterators based on the hlist ones.
- A very small amount of places were using the 'node' parameter, this
was modified to use 'obj->member' instead.
- Coccinelle didn't handle the hlist_for_each_entry_safe iterator
properly, so those had to be fixed up manually.
The semantic patch which is mostly the work of Peter Senna Tschudin is here:
@@
iterator name hlist_for_each_entry, hlist_for_each_entry_continue, hlist_for_each_entry_from, hlist_for_each_entry_rcu, hlist_for_each_entry_rcu_bh, hlist_for_each_entry_continue_rcu_bh, for_each_busy_worker, ax25_uid_for_each, ax25_for_each, inet_bind_bucket_for_each, sctp_for_each_hentry, sk_for_each, sk_for_each_rcu, sk_for_each_from, sk_for_each_safe, sk_for_each_bound, hlist_for_each_entry_safe, hlist_for_each_entry_continue_rcu, nr_neigh_for_each, nr_neigh_for_each_safe, nr_node_for_each, nr_node_for_each_safe, for_each_gfn_indirect_valid_sp, for_each_gfn_sp, for_each_host;
type T;
expression a,c,d,e;
identifier b;
statement S;
@@
-T b;
<+... when != b
(
hlist_for_each_entry(a,
- b,
c, d) S
|
hlist_for_each_entry_continue(a,
- b,
c) S
|
hlist_for_each_entry_from(a,
- b,
c) S
|
hlist_for_each_entry_rcu(a,
- b,
c, d) S
|
hlist_for_each_entry_rcu_bh(a,
- b,
c, d) S
|
hlist_for_each_entry_continue_rcu_bh(a,
- b,
c) S
|
for_each_busy_worker(a, c,
- b,
d) S
|
ax25_uid_for_each(a,
- b,
c) S
|
ax25_for_each(a,
- b,
c) S
|
inet_bind_bucket_for_each(a,
- b,
c) S
|
sctp_for_each_hentry(a,
- b,
c) S
|
sk_for_each(a,
- b,
c) S
|
sk_for_each_rcu(a,
- b,
c) S
|
sk_for_each_from
-(a, b)
+(a)
S
+ sk_for_each_from(a) S
|
sk_for_each_safe(a,
- b,
c, d) S
|
sk_for_each_bound(a,
- b,
c) S
|
hlist_for_each_entry_safe(a,
- b,
c, d, e) S
|
hlist_for_each_entry_continue_rcu(a,
- b,
c) S
|
nr_neigh_for_each(a,
- b,
c) S
|
nr_neigh_for_each_safe(a,
- b,
c, d) S
|
nr_node_for_each(a,
- b,
c) S
|
nr_node_for_each_safe(a,
- b,
c, d) S
|
- for_each_gfn_sp(a, c, d, b) S
+ for_each_gfn_sp(a, c, d) S
|
- for_each_gfn_indirect_valid_sp(a, c, d, b) S
+ for_each_gfn_indirect_valid_sp(a, c, d) S
|
for_each_host(a,
- b,
c) S
|
for_each_host_safe(a,
- b,
c, d) S
|
for_each_mesh_entry(a,
- b,
c, d) S
)
...+>
[akpm@linux-foundation.org: drop bogus change from net/ipv4/raw.c]
[akpm@linux-foundation.org: drop bogus hunk from net/ipv6/raw.c]
[akpm@linux-foundation.org: checkpatch fixes]
[akpm@linux-foundation.org: fix warnings]
[akpm@linux-foudnation.org: redo intrusive kvm changes]
Tested-by: Peter Senna Tschudin <peter.senna@gmail.com>
Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Sasha Levin <sasha.levin@oracle.com>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-02-28 05:06:00 +04:00
|
|
|
hlist_for_each_entry_safe(fle, tmp,
|
2010-04-07 04:30:07 +04:00
|
|
|
&fcp->hash_table[i], u.hlist) {
|
2010-04-07 04:30:04 +04:00
|
|
|
if (saved < shrink_to &&
|
2014-01-18 05:55:27 +04:00
|
|
|
flow_entry_valid(fle, xfrm)) {
|
2010-04-07 04:30:04 +04:00
|
|
|
saved++;
|
|
|
|
} else {
|
2010-04-07 04:30:07 +04:00
|
|
|
deleted++;
|
|
|
|
hlist_del(&fle->u.hlist);
|
|
|
|
list_add_tail(&fle->u.gc_list, &gc_list);
|
2010-04-07 04:30:04 +04:00
|
|
|
}
|
2005-04-17 02:20:36 +04:00
|
|
|
}
|
|
|
|
}
|
2010-04-07 04:30:07 +04:00
|
|
|
|
2014-01-18 05:55:27 +04:00
|
|
|
flow_cache_queue_garbage(fcp, deleted, &gc_list, xfrm);
|
2005-04-17 02:20:36 +04:00
|
|
|
}
|
|
|
|
|
2010-03-31 04:17:06 +04:00
|
|
|
static void flow_cache_shrink(struct flow_cache *fc,
|
|
|
|
struct flow_cache_percpu *fcp)
|
2005-04-17 02:20:36 +04:00
|
|
|
{
|
2010-03-31 04:17:06 +04:00
|
|
|
int shrink_to = fc->low_watermark / flow_cache_hash_size(fc);
|
2005-04-17 02:20:36 +04:00
|
|
|
|
2010-03-31 04:17:06 +04:00
|
|
|
__flow_cache_shrink(fc, fcp, shrink_to);
|
2005-04-17 02:20:36 +04:00
|
|
|
}
|
|
|
|
|
2010-03-31 04:17:06 +04:00
|
|
|
static void flow_new_hash_rnd(struct flow_cache *fc,
|
|
|
|
struct flow_cache_percpu *fcp)
|
2005-04-17 02:20:36 +04:00
|
|
|
{
|
2010-03-31 04:17:06 +04:00
|
|
|
get_random_bytes(&fcp->hash_rnd, sizeof(u32));
|
|
|
|
fcp->hash_rnd_recalc = 0;
|
|
|
|
__flow_cache_shrink(fc, fcp, 0);
|
2005-04-17 02:20:36 +04:00
|
|
|
}
|
|
|
|
|
2010-03-31 04:17:06 +04:00
|
|
|
static u32 flow_hash_code(struct flow_cache *fc,
|
|
|
|
struct flow_cache_percpu *fcp,
|
2011-09-05 20:47:24 +04:00
|
|
|
const struct flowi *key,
|
|
|
|
size_t keysize)
|
2005-04-17 02:20:36 +04:00
|
|
|
{
|
2011-02-23 05:44:31 +03:00
|
|
|
const u32 *k = (const u32 *) key;
|
2011-09-05 20:47:24 +04:00
|
|
|
const u32 length = keysize * sizeof(flow_compare_t) / sizeof(u32);
|
2005-04-17 02:20:36 +04:00
|
|
|
|
2011-09-05 20:47:24 +04:00
|
|
|
return jhash2(k, length, fcp->hash_rnd)
|
2010-09-23 00:43:57 +04:00
|
|
|
& (flow_cache_hash_size(fc) - 1);
|
2005-04-17 02:20:36 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/* I hear what you're saying, use memcmp. But memcmp cannot make
|
2011-09-05 20:47:24 +04:00
|
|
|
* important assumptions that we can here, such as alignment.
|
2005-04-17 02:20:36 +04:00
|
|
|
*/
|
2011-09-05 20:47:24 +04:00
|
|
|
static int flow_key_compare(const struct flowi *key1, const struct flowi *key2,
|
|
|
|
size_t keysize)
|
2005-04-17 02:20:36 +04:00
|
|
|
{
|
2011-02-23 05:44:31 +03:00
|
|
|
const flow_compare_t *k1, *k1_lim, *k2;
|
2005-04-17 02:20:36 +04:00
|
|
|
|
2011-02-23 05:44:31 +03:00
|
|
|
k1 = (const flow_compare_t *) key1;
|
2011-09-05 20:47:24 +04:00
|
|
|
k1_lim = k1 + keysize;
|
2005-04-17 02:20:36 +04:00
|
|
|
|
2011-02-23 05:44:31 +03:00
|
|
|
k2 = (const flow_compare_t *) key2;
|
2005-04-17 02:20:36 +04:00
|
|
|
|
|
|
|
do {
|
|
|
|
if (*k1++ != *k2++)
|
|
|
|
return 1;
|
|
|
|
} while (k1 < k1_lim);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2010-04-07 04:30:04 +04:00
|
|
|
struct flow_cache_object *
|
2011-02-23 05:44:31 +03:00
|
|
|
flow_cache_lookup(struct net *net, const struct flowi *key, u16 family, u8 dir,
|
2010-04-07 04:30:04 +04:00
|
|
|
flow_resolve_t resolver, void *ctx)
|
2005-04-17 02:20:36 +04:00
|
|
|
{
|
2014-01-18 05:55:27 +04:00
|
|
|
struct flow_cache *fc = &net->xfrm.flow_cache_global;
|
2010-03-31 04:17:06 +04:00
|
|
|
struct flow_cache_percpu *fcp;
|
2010-04-07 04:30:07 +04:00
|
|
|
struct flow_cache_entry *fle, *tfle;
|
2010-04-07 04:30:04 +04:00
|
|
|
struct flow_cache_object *flo;
|
2011-09-05 20:47:24 +04:00
|
|
|
size_t keysize;
|
2005-04-17 02:20:36 +04:00
|
|
|
unsigned int hash;
|
|
|
|
|
|
|
|
local_bh_disable();
|
2010-06-24 04:52:37 +04:00
|
|
|
fcp = this_cpu_ptr(fc->percpu);
|
2005-04-17 02:20:36 +04:00
|
|
|
|
|
|
|
fle = NULL;
|
2010-04-07 04:30:04 +04:00
|
|
|
flo = NULL;
|
2011-09-05 20:47:24 +04:00
|
|
|
|
|
|
|
keysize = flow_key_size(family);
|
|
|
|
if (!keysize)
|
|
|
|
goto nocache;
|
|
|
|
|
2005-04-17 02:20:36 +04:00
|
|
|
/* Packet really early in init? Making flow_cache_init a
|
|
|
|
* pre-smp initcall would solve this. --RR */
|
2010-03-31 04:17:06 +04:00
|
|
|
if (!fcp->hash_table)
|
2005-04-17 02:20:36 +04:00
|
|
|
goto nocache;
|
|
|
|
|
2010-03-31 04:17:06 +04:00
|
|
|
if (fcp->hash_rnd_recalc)
|
|
|
|
flow_new_hash_rnd(fc, fcp);
|
2005-04-17 02:20:36 +04:00
|
|
|
|
2011-09-05 20:47:24 +04:00
|
|
|
hash = flow_hash_code(fc, fcp, key, keysize);
|
hlist: drop the node parameter from iterators
I'm not sure why, but the hlist for each entry iterators were conceived
list_for_each_entry(pos, head, member)
The hlist ones were greedy and wanted an extra parameter:
hlist_for_each_entry(tpos, pos, head, member)
Why did they need an extra pos parameter? I'm not quite sure. Not only
they don't really need it, it also prevents the iterator from looking
exactly like the list iterator, which is unfortunate.
Besides the semantic patch, there was some manual work required:
- Fix up the actual hlist iterators in linux/list.h
- Fix up the declaration of other iterators based on the hlist ones.
- A very small amount of places were using the 'node' parameter, this
was modified to use 'obj->member' instead.
- Coccinelle didn't handle the hlist_for_each_entry_safe iterator
properly, so those had to be fixed up manually.
The semantic patch which is mostly the work of Peter Senna Tschudin is here:
@@
iterator name hlist_for_each_entry, hlist_for_each_entry_continue, hlist_for_each_entry_from, hlist_for_each_entry_rcu, hlist_for_each_entry_rcu_bh, hlist_for_each_entry_continue_rcu_bh, for_each_busy_worker, ax25_uid_for_each, ax25_for_each, inet_bind_bucket_for_each, sctp_for_each_hentry, sk_for_each, sk_for_each_rcu, sk_for_each_from, sk_for_each_safe, sk_for_each_bound, hlist_for_each_entry_safe, hlist_for_each_entry_continue_rcu, nr_neigh_for_each, nr_neigh_for_each_safe, nr_node_for_each, nr_node_for_each_safe, for_each_gfn_indirect_valid_sp, for_each_gfn_sp, for_each_host;
type T;
expression a,c,d,e;
identifier b;
statement S;
@@
-T b;
<+... when != b
(
hlist_for_each_entry(a,
- b,
c, d) S
|
hlist_for_each_entry_continue(a,
- b,
c) S
|
hlist_for_each_entry_from(a,
- b,
c) S
|
hlist_for_each_entry_rcu(a,
- b,
c, d) S
|
hlist_for_each_entry_rcu_bh(a,
- b,
c, d) S
|
hlist_for_each_entry_continue_rcu_bh(a,
- b,
c) S
|
for_each_busy_worker(a, c,
- b,
d) S
|
ax25_uid_for_each(a,
- b,
c) S
|
ax25_for_each(a,
- b,
c) S
|
inet_bind_bucket_for_each(a,
- b,
c) S
|
sctp_for_each_hentry(a,
- b,
c) S
|
sk_for_each(a,
- b,
c) S
|
sk_for_each_rcu(a,
- b,
c) S
|
sk_for_each_from
-(a, b)
+(a)
S
+ sk_for_each_from(a) S
|
sk_for_each_safe(a,
- b,
c, d) S
|
sk_for_each_bound(a,
- b,
c) S
|
hlist_for_each_entry_safe(a,
- b,
c, d, e) S
|
hlist_for_each_entry_continue_rcu(a,
- b,
c) S
|
nr_neigh_for_each(a,
- b,
c) S
|
nr_neigh_for_each_safe(a,
- b,
c, d) S
|
nr_node_for_each(a,
- b,
c) S
|
nr_node_for_each_safe(a,
- b,
c, d) S
|
- for_each_gfn_sp(a, c, d, b) S
+ for_each_gfn_sp(a, c, d) S
|
- for_each_gfn_indirect_valid_sp(a, c, d, b) S
+ for_each_gfn_indirect_valid_sp(a, c, d) S
|
for_each_host(a,
- b,
c) S
|
for_each_host_safe(a,
- b,
c, d) S
|
for_each_mesh_entry(a,
- b,
c, d) S
)
...+>
[akpm@linux-foundation.org: drop bogus change from net/ipv4/raw.c]
[akpm@linux-foundation.org: drop bogus hunk from net/ipv6/raw.c]
[akpm@linux-foundation.org: checkpatch fixes]
[akpm@linux-foundation.org: fix warnings]
[akpm@linux-foudnation.org: redo intrusive kvm changes]
Tested-by: Peter Senna Tschudin <peter.senna@gmail.com>
Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Sasha Levin <sasha.levin@oracle.com>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-02-28 05:06:00 +04:00
|
|
|
hlist_for_each_entry(tfle, &fcp->hash_table[hash], u.hlist) {
|
2011-08-31 10:05:27 +04:00
|
|
|
if (tfle->net == net &&
|
|
|
|
tfle->family == family &&
|
2010-04-07 04:30:07 +04:00
|
|
|
tfle->dir == dir &&
|
2011-09-05 20:47:24 +04:00
|
|
|
flow_key_compare(key, &tfle->key, keysize) == 0) {
|
2010-04-07 04:30:07 +04:00
|
|
|
fle = tfle;
|
2005-04-17 02:20:36 +04:00
|
|
|
break;
|
2010-04-07 04:30:07 +04:00
|
|
|
}
|
2005-04-17 02:20:36 +04:00
|
|
|
}
|
|
|
|
|
2010-04-07 04:30:04 +04:00
|
|
|
if (unlikely(!fle)) {
|
2010-03-31 04:17:06 +04:00
|
|
|
if (fcp->hash_count > fc->high_watermark)
|
|
|
|
flow_cache_shrink(fc, fcp);
|
2005-04-17 02:20:36 +04:00
|
|
|
|
2016-11-21 17:48:21 +03:00
|
|
|
if (atomic_read(&net->xfrm.flow_cache_gc_count) >
|
|
|
|
2 * num_online_cpus() * fc->high_watermark) {
|
2016-02-22 12:40:07 +03:00
|
|
|
flo = ERR_PTR(-ENOBUFS);
|
|
|
|
goto ret_object;
|
|
|
|
}
|
|
|
|
|
2014-03-10 18:09:07 +04:00
|
|
|
fle = kmem_cache_alloc(flow_cachep, GFP_ATOMIC);
|
2005-04-17 02:20:36 +04:00
|
|
|
if (fle) {
|
2011-08-31 10:05:27 +04:00
|
|
|
fle->net = net;
|
2005-04-17 02:20:36 +04:00
|
|
|
fle->family = family;
|
|
|
|
fle->dir = dir;
|
2011-09-05 20:47:24 +04:00
|
|
|
memcpy(&fle->key, key, keysize * sizeof(flow_compare_t));
|
2005-04-17 02:20:36 +04:00
|
|
|
fle->object = NULL;
|
2010-04-07 04:30:07 +04:00
|
|
|
hlist_add_head(&fle->u.hlist, &fcp->hash_table[hash]);
|
2010-03-31 04:17:06 +04:00
|
|
|
fcp->hash_count++;
|
2005-04-17 02:20:36 +04:00
|
|
|
}
|
2014-01-18 05:55:27 +04:00
|
|
|
} else if (likely(fle->genid == atomic_read(&net->xfrm.flow_cache_genid))) {
|
2010-04-07 04:30:04 +04:00
|
|
|
flo = fle->object;
|
|
|
|
if (!flo)
|
|
|
|
goto ret_object;
|
|
|
|
flo = flo->ops->get(flo);
|
|
|
|
if (flo)
|
|
|
|
goto ret_object;
|
|
|
|
} else if (fle->object) {
|
|
|
|
flo = fle->object;
|
|
|
|
flo->ops->delete(flo);
|
|
|
|
fle->object = NULL;
|
2005-04-17 02:20:36 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
nocache:
|
2010-04-07 04:30:04 +04:00
|
|
|
flo = NULL;
|
|
|
|
if (fle) {
|
|
|
|
flo = fle->object;
|
|
|
|
fle->object = NULL;
|
|
|
|
}
|
|
|
|
flo = resolver(net, key, family, dir, flo, ctx);
|
|
|
|
if (fle) {
|
2014-01-18 05:55:27 +04:00
|
|
|
fle->genid = atomic_read(&net->xfrm.flow_cache_genid);
|
2010-04-07 04:30:04 +04:00
|
|
|
if (!IS_ERR(flo))
|
|
|
|
fle->object = flo;
|
|
|
|
else
|
|
|
|
fle->genid--;
|
|
|
|
} else {
|
2013-01-22 10:32:44 +04:00
|
|
|
if (!IS_ERR_OR_NULL(flo))
|
2010-04-07 04:30:04 +04:00
|
|
|
flo->ops->delete(flo);
|
2005-04-17 02:20:36 +04:00
|
|
|
}
|
2010-04-07 04:30:04 +04:00
|
|
|
ret_object:
|
|
|
|
local_bh_enable();
|
|
|
|
return flo;
|
2005-04-17 02:20:36 +04:00
|
|
|
}
|
2010-07-10 01:22:04 +04:00
|
|
|
EXPORT_SYMBOL(flow_cache_lookup);
|
2005-04-17 02:20:36 +04:00
|
|
|
|
|
|
|
static void flow_cache_flush_tasklet(unsigned long data)
|
|
|
|
{
|
|
|
|
struct flow_flush_info *info = (void *)data;
|
2010-03-31 04:17:06 +04:00
|
|
|
struct flow_cache *fc = info->cache;
|
|
|
|
struct flow_cache_percpu *fcp;
|
2010-04-07 04:30:07 +04:00
|
|
|
struct flow_cache_entry *fle;
|
hlist: drop the node parameter from iterators
I'm not sure why, but the hlist for each entry iterators were conceived
list_for_each_entry(pos, head, member)
The hlist ones were greedy and wanted an extra parameter:
hlist_for_each_entry(tpos, pos, head, member)
Why did they need an extra pos parameter? I'm not quite sure. Not only
they don't really need it, it also prevents the iterator from looking
exactly like the list iterator, which is unfortunate.
Besides the semantic patch, there was some manual work required:
- Fix up the actual hlist iterators in linux/list.h
- Fix up the declaration of other iterators based on the hlist ones.
- A very small amount of places were using the 'node' parameter, this
was modified to use 'obj->member' instead.
- Coccinelle didn't handle the hlist_for_each_entry_safe iterator
properly, so those had to be fixed up manually.
The semantic patch which is mostly the work of Peter Senna Tschudin is here:
@@
iterator name hlist_for_each_entry, hlist_for_each_entry_continue, hlist_for_each_entry_from, hlist_for_each_entry_rcu, hlist_for_each_entry_rcu_bh, hlist_for_each_entry_continue_rcu_bh, for_each_busy_worker, ax25_uid_for_each, ax25_for_each, inet_bind_bucket_for_each, sctp_for_each_hentry, sk_for_each, sk_for_each_rcu, sk_for_each_from, sk_for_each_safe, sk_for_each_bound, hlist_for_each_entry_safe, hlist_for_each_entry_continue_rcu, nr_neigh_for_each, nr_neigh_for_each_safe, nr_node_for_each, nr_node_for_each_safe, for_each_gfn_indirect_valid_sp, for_each_gfn_sp, for_each_host;
type T;
expression a,c,d,e;
identifier b;
statement S;
@@
-T b;
<+... when != b
(
hlist_for_each_entry(a,
- b,
c, d) S
|
hlist_for_each_entry_continue(a,
- b,
c) S
|
hlist_for_each_entry_from(a,
- b,
c) S
|
hlist_for_each_entry_rcu(a,
- b,
c, d) S
|
hlist_for_each_entry_rcu_bh(a,
- b,
c, d) S
|
hlist_for_each_entry_continue_rcu_bh(a,
- b,
c) S
|
for_each_busy_worker(a, c,
- b,
d) S
|
ax25_uid_for_each(a,
- b,
c) S
|
ax25_for_each(a,
- b,
c) S
|
inet_bind_bucket_for_each(a,
- b,
c) S
|
sctp_for_each_hentry(a,
- b,
c) S
|
sk_for_each(a,
- b,
c) S
|
sk_for_each_rcu(a,
- b,
c) S
|
sk_for_each_from
-(a, b)
+(a)
S
+ sk_for_each_from(a) S
|
sk_for_each_safe(a,
- b,
c, d) S
|
sk_for_each_bound(a,
- b,
c) S
|
hlist_for_each_entry_safe(a,
- b,
c, d, e) S
|
hlist_for_each_entry_continue_rcu(a,
- b,
c) S
|
nr_neigh_for_each(a,
- b,
c) S
|
nr_neigh_for_each_safe(a,
- b,
c, d) S
|
nr_node_for_each(a,
- b,
c) S
|
nr_node_for_each_safe(a,
- b,
c, d) S
|
- for_each_gfn_sp(a, c, d, b) S
+ for_each_gfn_sp(a, c, d) S
|
- for_each_gfn_indirect_valid_sp(a, c, d, b) S
+ for_each_gfn_indirect_valid_sp(a, c, d) S
|
for_each_host(a,
- b,
c) S
|
for_each_host_safe(a,
- b,
c, d) S
|
for_each_mesh_entry(a,
- b,
c, d) S
)
...+>
[akpm@linux-foundation.org: drop bogus change from net/ipv4/raw.c]
[akpm@linux-foundation.org: drop bogus hunk from net/ipv6/raw.c]
[akpm@linux-foundation.org: checkpatch fixes]
[akpm@linux-foundation.org: fix warnings]
[akpm@linux-foudnation.org: redo intrusive kvm changes]
Tested-by: Peter Senna Tschudin <peter.senna@gmail.com>
Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Sasha Levin <sasha.levin@oracle.com>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-02-28 05:06:00 +04:00
|
|
|
struct hlist_node *tmp;
|
2010-04-07 04:30:07 +04:00
|
|
|
LIST_HEAD(gc_list);
|
|
|
|
int i, deleted = 0;
|
2014-01-18 05:55:27 +04:00
|
|
|
struct netns_xfrm *xfrm = container_of(fc, struct netns_xfrm,
|
|
|
|
flow_cache_global);
|
2005-04-17 02:20:36 +04:00
|
|
|
|
2010-06-24 04:52:37 +04:00
|
|
|
fcp = this_cpu_ptr(fc->percpu);
|
2010-03-31 04:17:06 +04:00
|
|
|
for (i = 0; i < flow_cache_hash_size(fc); i++) {
|
hlist: drop the node parameter from iterators
I'm not sure why, but the hlist for each entry iterators were conceived
list_for_each_entry(pos, head, member)
The hlist ones were greedy and wanted an extra parameter:
hlist_for_each_entry(tpos, pos, head, member)
Why did they need an extra pos parameter? I'm not quite sure. Not only
they don't really need it, it also prevents the iterator from looking
exactly like the list iterator, which is unfortunate.
Besides the semantic patch, there was some manual work required:
- Fix up the actual hlist iterators in linux/list.h
- Fix up the declaration of other iterators based on the hlist ones.
- A very small amount of places were using the 'node' parameter, this
was modified to use 'obj->member' instead.
- Coccinelle didn't handle the hlist_for_each_entry_safe iterator
properly, so those had to be fixed up manually.
The semantic patch which is mostly the work of Peter Senna Tschudin is here:
@@
iterator name hlist_for_each_entry, hlist_for_each_entry_continue, hlist_for_each_entry_from, hlist_for_each_entry_rcu, hlist_for_each_entry_rcu_bh, hlist_for_each_entry_continue_rcu_bh, for_each_busy_worker, ax25_uid_for_each, ax25_for_each, inet_bind_bucket_for_each, sctp_for_each_hentry, sk_for_each, sk_for_each_rcu, sk_for_each_from, sk_for_each_safe, sk_for_each_bound, hlist_for_each_entry_safe, hlist_for_each_entry_continue_rcu, nr_neigh_for_each, nr_neigh_for_each_safe, nr_node_for_each, nr_node_for_each_safe, for_each_gfn_indirect_valid_sp, for_each_gfn_sp, for_each_host;
type T;
expression a,c,d,e;
identifier b;
statement S;
@@
-T b;
<+... when != b
(
hlist_for_each_entry(a,
- b,
c, d) S
|
hlist_for_each_entry_continue(a,
- b,
c) S
|
hlist_for_each_entry_from(a,
- b,
c) S
|
hlist_for_each_entry_rcu(a,
- b,
c, d) S
|
hlist_for_each_entry_rcu_bh(a,
- b,
c, d) S
|
hlist_for_each_entry_continue_rcu_bh(a,
- b,
c) S
|
for_each_busy_worker(a, c,
- b,
d) S
|
ax25_uid_for_each(a,
- b,
c) S
|
ax25_for_each(a,
- b,
c) S
|
inet_bind_bucket_for_each(a,
- b,
c) S
|
sctp_for_each_hentry(a,
- b,
c) S
|
sk_for_each(a,
- b,
c) S
|
sk_for_each_rcu(a,
- b,
c) S
|
sk_for_each_from
-(a, b)
+(a)
S
+ sk_for_each_from(a) S
|
sk_for_each_safe(a,
- b,
c, d) S
|
sk_for_each_bound(a,
- b,
c) S
|
hlist_for_each_entry_safe(a,
- b,
c, d, e) S
|
hlist_for_each_entry_continue_rcu(a,
- b,
c) S
|
nr_neigh_for_each(a,
- b,
c) S
|
nr_neigh_for_each_safe(a,
- b,
c, d) S
|
nr_node_for_each(a,
- b,
c) S
|
nr_node_for_each_safe(a,
- b,
c, d) S
|
- for_each_gfn_sp(a, c, d, b) S
+ for_each_gfn_sp(a, c, d) S
|
- for_each_gfn_indirect_valid_sp(a, c, d, b) S
+ for_each_gfn_indirect_valid_sp(a, c, d) S
|
for_each_host(a,
- b,
c) S
|
for_each_host_safe(a,
- b,
c, d) S
|
for_each_mesh_entry(a,
- b,
c, d) S
)
...+>
[akpm@linux-foundation.org: drop bogus change from net/ipv4/raw.c]
[akpm@linux-foundation.org: drop bogus hunk from net/ipv6/raw.c]
[akpm@linux-foundation.org: checkpatch fixes]
[akpm@linux-foundation.org: fix warnings]
[akpm@linux-foudnation.org: redo intrusive kvm changes]
Tested-by: Peter Senna Tschudin <peter.senna@gmail.com>
Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Sasha Levin <sasha.levin@oracle.com>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-02-28 05:06:00 +04:00
|
|
|
hlist_for_each_entry_safe(fle, tmp,
|
2010-04-07 04:30:07 +04:00
|
|
|
&fcp->hash_table[i], u.hlist) {
|
2014-01-18 05:55:27 +04:00
|
|
|
if (flow_entry_valid(fle, xfrm))
|
2005-04-17 02:20:36 +04:00
|
|
|
continue;
|
|
|
|
|
2010-04-07 04:30:07 +04:00
|
|
|
deleted++;
|
|
|
|
hlist_del(&fle->u.hlist);
|
|
|
|
list_add_tail(&fle->u.gc_list, &gc_list);
|
2005-04-17 02:20:36 +04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-01-18 05:55:27 +04:00
|
|
|
flow_cache_queue_garbage(fcp, deleted, &gc_list, xfrm);
|
2010-04-07 04:30:07 +04:00
|
|
|
|
2005-04-17 02:20:36 +04:00
|
|
|
if (atomic_dec_and_test(&info->cpuleft))
|
|
|
|
complete(&info->completion);
|
|
|
|
}
|
|
|
|
|
2013-03-19 15:35:58 +04:00
|
|
|
/*
|
|
|
|
* Return whether a cpu needs flushing. Conservatively, we assume
|
|
|
|
* the presence of any entries means the core may require flushing,
|
|
|
|
* since the flow_cache_ops.check() function may assume it's running
|
|
|
|
* on the same core as the per-cpu cache component.
|
|
|
|
*/
|
|
|
|
static int flow_cache_percpu_empty(struct flow_cache *fc, int cpu)
|
|
|
|
{
|
|
|
|
struct flow_cache_percpu *fcp;
|
|
|
|
int i;
|
|
|
|
|
2013-03-28 06:24:11 +04:00
|
|
|
fcp = per_cpu_ptr(fc->percpu, cpu);
|
2013-03-19 15:35:58 +04:00
|
|
|
for (i = 0; i < flow_cache_hash_size(fc); i++)
|
|
|
|
if (!hlist_empty(&fcp->hash_table[i]))
|
|
|
|
return 0;
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
2005-04-17 02:20:36 +04:00
|
|
|
static void flow_cache_flush_per_cpu(void *data)
|
|
|
|
{
|
|
|
|
struct flow_flush_info *info = data;
|
|
|
|
struct tasklet_struct *tasklet;
|
|
|
|
|
2013-03-28 03:42:41 +04:00
|
|
|
tasklet = &this_cpu_ptr(info->cache->percpu)->flush_tasklet;
|
2005-04-17 02:20:36 +04:00
|
|
|
tasklet->data = (unsigned long)info;
|
|
|
|
tasklet_schedule(tasklet);
|
|
|
|
}
|
|
|
|
|
2014-01-18 05:55:27 +04:00
|
|
|
void flow_cache_flush(struct net *net)
|
2005-04-17 02:20:36 +04:00
|
|
|
{
|
|
|
|
struct flow_flush_info info;
|
2013-03-19 15:35:58 +04:00
|
|
|
cpumask_var_t mask;
|
|
|
|
int i, self;
|
|
|
|
|
|
|
|
/* Track which cpus need flushing to avoid disturbing all cores. */
|
|
|
|
if (!alloc_cpumask_var(&mask, GFP_KERNEL))
|
|
|
|
return;
|
|
|
|
cpumask_clear(mask);
|
2005-04-17 02:20:36 +04:00
|
|
|
|
|
|
|
/* Don't want cpus going down or up during this. */
|
2008-01-25 23:08:02 +03:00
|
|
|
get_online_cpus();
|
2014-01-18 05:55:27 +04:00
|
|
|
mutex_lock(&net->xfrm.flow_flush_sem);
|
|
|
|
info.cache = &net->xfrm.flow_cache_global;
|
2013-03-19 15:35:58 +04:00
|
|
|
for_each_online_cpu(i)
|
|
|
|
if (!flow_cache_percpu_empty(info.cache, i))
|
|
|
|
cpumask_set_cpu(i, mask);
|
|
|
|
atomic_set(&info.cpuleft, cpumask_weight(mask));
|
|
|
|
if (atomic_read(&info.cpuleft) == 0)
|
|
|
|
goto done;
|
|
|
|
|
2005-04-17 02:20:36 +04:00
|
|
|
init_completion(&info.completion);
|
|
|
|
|
|
|
|
local_bh_disable();
|
2013-03-19 15:35:58 +04:00
|
|
|
self = cpumask_test_and_clear_cpu(smp_processor_id(), mask);
|
|
|
|
on_each_cpu_mask(mask, flow_cache_flush_per_cpu, &info, 0);
|
|
|
|
if (self)
|
|
|
|
flow_cache_flush_tasklet((unsigned long)&info);
|
2005-04-17 02:20:36 +04:00
|
|
|
local_bh_enable();
|
|
|
|
|
|
|
|
wait_for_completion(&info.completion);
|
2013-03-19 15:35:58 +04:00
|
|
|
|
|
|
|
done:
|
2014-01-18 05:55:27 +04:00
|
|
|
mutex_unlock(&net->xfrm.flow_flush_sem);
|
2008-01-25 23:08:02 +03:00
|
|
|
put_online_cpus();
|
2013-03-19 15:35:58 +04:00
|
|
|
free_cpumask_var(mask);
|
2005-04-17 02:20:36 +04:00
|
|
|
}
|
|
|
|
|
2011-12-22 01:48:08 +04:00
|
|
|
static void flow_cache_flush_task(struct work_struct *work)
|
|
|
|
{
|
2014-01-18 05:55:27 +04:00
|
|
|
struct netns_xfrm *xfrm = container_of(work, struct netns_xfrm,
|
2015-02-05 18:36:50 +03:00
|
|
|
flow_cache_flush_work);
|
2014-01-18 05:55:27 +04:00
|
|
|
struct net *net = container_of(xfrm, struct net, xfrm);
|
2011-12-22 01:48:08 +04:00
|
|
|
|
2014-01-18 05:55:27 +04:00
|
|
|
flow_cache_flush(net);
|
|
|
|
}
|
2011-12-22 01:48:08 +04:00
|
|
|
|
2014-01-18 05:55:27 +04:00
|
|
|
void flow_cache_flush_deferred(struct net *net)
|
2011-12-22 01:48:08 +04:00
|
|
|
{
|
2014-01-18 05:55:27 +04:00
|
|
|
schedule_work(&net->xfrm.flow_cache_flush_work);
|
2011-12-22 01:48:08 +04:00
|
|
|
}
|
|
|
|
|
2013-06-19 22:32:33 +04:00
|
|
|
static int flow_cache_cpu_prepare(struct flow_cache *fc, int cpu)
|
2005-04-17 02:20:36 +04:00
|
|
|
{
|
2010-09-10 11:00:25 +04:00
|
|
|
struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu);
|
|
|
|
size_t sz = sizeof(struct hlist_head) * flow_cache_hash_size(fc);
|
2010-03-31 04:17:06 +04:00
|
|
|
|
2010-09-10 11:00:25 +04:00
|
|
|
if (!fcp->hash_table) {
|
|
|
|
fcp->hash_table = kzalloc_node(sz, GFP_KERNEL, cpu_to_node(cpu));
|
|
|
|
if (!fcp->hash_table) {
|
|
|
|
pr_err("NET: failed to allocate flow cache sz %zu\n", sz);
|
|
|
|
return -ENOMEM;
|
|
|
|
}
|
|
|
|
fcp->hash_rnd_recalc = 1;
|
|
|
|
fcp->hash_count = 0;
|
|
|
|
tasklet_init(&fcp->flush_tasklet, flow_cache_flush_tasklet, 0);
|
|
|
|
}
|
|
|
|
return 0;
|
2005-04-17 02:20:36 +04:00
|
|
|
}
|
|
|
|
|
2013-06-19 22:32:33 +04:00
|
|
|
static int flow_cache_cpu(struct notifier_block *nfb,
|
2005-04-17 02:20:36 +04:00
|
|
|
unsigned long action,
|
|
|
|
void *hcpu)
|
|
|
|
{
|
2014-01-18 05:55:27 +04:00
|
|
|
struct flow_cache *fc = container_of(nfb, struct flow_cache,
|
|
|
|
hotcpu_notifier);
|
2010-09-10 11:00:25 +04:00
|
|
|
int res, cpu = (unsigned long) hcpu;
|
2010-03-31 04:17:06 +04:00
|
|
|
struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu);
|
|
|
|
|
2010-09-10 11:00:25 +04:00
|
|
|
switch (action) {
|
|
|
|
case CPU_UP_PREPARE:
|
|
|
|
case CPU_UP_PREPARE_FROZEN:
|
|
|
|
res = flow_cache_cpu_prepare(fc, cpu);
|
|
|
|
if (res)
|
|
|
|
return notifier_from_errno(res);
|
|
|
|
break;
|
|
|
|
case CPU_DEAD:
|
|
|
|
case CPU_DEAD_FROZEN:
|
2010-03-31 04:17:06 +04:00
|
|
|
__flow_cache_shrink(fc, fcp, 0);
|
2010-09-10 11:00:25 +04:00
|
|
|
break;
|
|
|
|
}
|
2005-04-17 02:20:36 +04:00
|
|
|
return NOTIFY_OK;
|
|
|
|
}
|
|
|
|
|
2014-01-18 05:55:27 +04:00
|
|
|
int flow_cache_init(struct net *net)
|
2005-04-17 02:20:36 +04:00
|
|
|
{
|
|
|
|
int i;
|
2014-01-18 05:55:27 +04:00
|
|
|
struct flow_cache *fc = &net->xfrm.flow_cache_global;
|
|
|
|
|
2014-03-10 18:09:07 +04:00
|
|
|
if (!flow_cachep)
|
|
|
|
flow_cachep = kmem_cache_create("flow_cache",
|
|
|
|
sizeof(struct flow_cache_entry),
|
|
|
|
0, SLAB_PANIC, NULL);
|
2014-01-18 05:55:27 +04:00
|
|
|
spin_lock_init(&net->xfrm.flow_cache_gc_lock);
|
|
|
|
INIT_LIST_HEAD(&net->xfrm.flow_cache_gc_list);
|
|
|
|
INIT_WORK(&net->xfrm.flow_cache_gc_work, flow_cache_gc_task);
|
|
|
|
INIT_WORK(&net->xfrm.flow_cache_flush_work, flow_cache_flush_task);
|
|
|
|
mutex_init(&net->xfrm.flow_flush_sem);
|
2016-02-22 12:40:07 +03:00
|
|
|
atomic_set(&net->xfrm.flow_cache_gc_count, 0);
|
2005-04-17 02:20:36 +04:00
|
|
|
|
2010-03-31 04:17:06 +04:00
|
|
|
fc->hash_shift = 10;
|
|
|
|
fc->low_watermark = 2 * flow_cache_hash_size(fc);
|
|
|
|
fc->high_watermark = 4 * flow_cache_hash_size(fc);
|
|
|
|
|
|
|
|
fc->percpu = alloc_percpu(struct flow_cache_percpu);
|
2010-09-10 11:00:25 +04:00
|
|
|
if (!fc->percpu)
|
|
|
|
return -ENOMEM;
|
2005-04-17 02:20:36 +04:00
|
|
|
|
2014-03-11 00:42:51 +04:00
|
|
|
cpu_notifier_register_begin();
|
|
|
|
|
2010-09-10 11:00:25 +04:00
|
|
|
for_each_online_cpu(i) {
|
|
|
|
if (flow_cache_cpu_prepare(fc, i))
|
2011-09-28 02:51:39 +04:00
|
|
|
goto err;
|
2010-09-10 11:00:25 +04:00
|
|
|
}
|
2010-03-31 04:17:06 +04:00
|
|
|
fc->hotcpu_notifier = (struct notifier_block){
|
|
|
|
.notifier_call = flow_cache_cpu,
|
|
|
|
};
|
2014-03-11 00:42:51 +04:00
|
|
|
__register_hotcpu_notifier(&fc->hotcpu_notifier);
|
|
|
|
|
|
|
|
cpu_notifier_register_done();
|
2005-04-17 02:20:36 +04:00
|
|
|
|
2010-09-10 11:00:25 +04:00
|
|
|
setup_timer(&fc->rnd_timer, flow_cache_new_hashrnd,
|
|
|
|
(unsigned long) fc);
|
|
|
|
fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
|
|
|
|
add_timer(&fc->rnd_timer);
|
|
|
|
|
2005-04-17 02:20:36 +04:00
|
|
|
return 0;
|
2011-09-28 02:51:39 +04:00
|
|
|
|
|
|
|
err:
|
|
|
|
for_each_possible_cpu(i) {
|
|
|
|
struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, i);
|
|
|
|
kfree(fcp->hash_table);
|
|
|
|
fcp->hash_table = NULL;
|
|
|
|
}
|
|
|
|
|
2014-03-11 00:42:51 +04:00
|
|
|
cpu_notifier_register_done();
|
|
|
|
|
2011-09-28 02:51:39 +04:00
|
|
|
free_percpu(fc->percpu);
|
|
|
|
fc->percpu = NULL;
|
|
|
|
|
|
|
|
return -ENOMEM;
|
2005-04-17 02:20:36 +04:00
|
|
|
}
|
2014-01-18 05:55:27 +04:00
|
|
|
EXPORT_SYMBOL(flow_cache_init);
|
2014-03-12 12:43:17 +04:00
|
|
|
|
|
|
|
void flow_cache_fini(struct net *net)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
struct flow_cache *fc = &net->xfrm.flow_cache_global;
|
|
|
|
|
|
|
|
del_timer_sync(&fc->rnd_timer);
|
|
|
|
unregister_hotcpu_notifier(&fc->hotcpu_notifier);
|
|
|
|
|
|
|
|
for_each_possible_cpu(i) {
|
|
|
|
struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, i);
|
|
|
|
kfree(fcp->hash_table);
|
|
|
|
fcp->hash_table = NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
free_percpu(fc->percpu);
|
|
|
|
fc->percpu = NULL;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(flow_cache_fini);
|