/*
 * Linux Socket Filter Data Structures
 */
|
|
|
|
#ifndef __LINUX_FILTER_H__
|
|
|
|
#define __LINUX_FILTER_H__
|
|
|
|
|
2011-07-27 03:09:06 +04:00
|
|
|
#include <linux/atomic.h>
|
2012-04-13 01:47:53 +04:00
|
|
|
#include <linux/compat.h>
|
2013-10-04 11:14:06 +04:00
|
|
|
#include <linux/workqueue.h>
|
2012-10-13 13:46:48 +04:00
|
|
|
#include <uapi/linux/filter.h>
|
2011-05-22 11:08:11 +04:00
|
|
|
|
2012-04-13 01:47:53 +04:00
|
|
|
#ifdef CONFIG_COMPAT
/*
 * 32-bit compat view of struct sock_fprog for 64-bit kernels.
 *
 * A struct sock_filter itself is architecture independent, so only the
 * user pointer to the filter array needs a compat representation.
 */
struct compat_sock_fprog {
	u16		len;	/* number of struct sock_filter entries */
	compat_uptr_t	filter;	/* struct sock_filter * */
};
#endif
|
|
|
|
|
2011-05-22 11:08:11 +04:00
|
|
|
struct sk_buff;
|
|
|
|
struct sock;
|
|
|
|
|
2008-04-10 12:33:47 +04:00
|
|
|
struct sk_filter
|
|
|
|
{
|
|
|
|
atomic_t refcnt;
|
|
|
|
unsigned int len; /* Number of filter blocks */
|
2013-10-04 11:14:06 +04:00
|
|
|
struct rcu_head rcu;
|
2011-04-20 13:27:32 +04:00
|
|
|
unsigned int (*bpf_func)(const struct sk_buff *skb,
|
|
|
|
const struct sock_filter *filter);
|
2013-10-04 11:14:06 +04:00
|
|
|
union {
|
|
|
|
struct sock_filter insns[0];
|
|
|
|
struct work_struct work;
|
|
|
|
};
|
2008-04-10 12:33:47 +04:00
|
|
|
};
|
|
|
|
|
2013-10-04 11:14:06 +04:00
|
|
|
static inline unsigned int sk_filter_size(unsigned int proglen)
|
2008-04-10 12:33:47 +04:00
|
|
|
{
|
2013-10-04 11:14:06 +04:00
|
|
|
return max(sizeof(struct sk_filter),
|
|
|
|
offsetof(struct sk_filter, insns[proglen]));
|
2008-04-10 12:33:47 +04:00
|
|
|
}
|
|
|
|
|
2008-04-10 12:43:09 +04:00
|
|
|
extern int sk_filter(struct sock *sk, struct sk_buff *skb);
|
2010-12-06 23:50:09 +03:00
|
|
|
extern unsigned int sk_run_filter(const struct sk_buff *skb,
|
2010-11-19 20:49:59 +03:00
|
|
|
const struct sock_filter *filter);
|
2012-03-31 15:01:19 +04:00
|
|
|
extern int sk_unattached_filter_create(struct sk_filter **pfp,
|
|
|
|
struct sock_fprog *fprog);
|
|
|
|
extern void sk_unattached_filter_destroy(struct sk_filter *fp);
|
2005-04-17 02:20:36 +04:00
|
|
|
extern int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk);
|
2007-10-18 08:21:26 +04:00
|
|
|
extern int sk_detach_filter(struct sock *sk);
|
2011-10-18 01:04:20 +04:00
|
|
|
extern int sk_chk_filter(struct sock_filter *filter, unsigned int flen);
|
sk-filter: Add ability to get socket filter program (v2)
The SO_ATTACH_FILTER option is set only. I propose to add the get
ability by using SO_ATTACH_FILTER in getsockopt. To be less
irritating to eyes the SO_GET_FILTER alias to it is declared. This
ability is required by checkpoint-restore project to be able to
save full state of a socket.
There are two issues with getting filter back.
First, kernel modifies the sock_filter->code on filter load, thus in
order to return the filter element back to user we have to decode it
into user-visible constants. Fortunately the modification in question
is interconvertible.
Second, the BPF_S_ALU_DIV_K code modifies the command argument k to
speed up the run-time division by doing kernel_k = reciprocal(user_k).
Bad news is that different user_k may result in same kernel_k, so we
can't get the original user_k back. Good news is that we don't have
to do it. What we need to is calculate a user2_k so, that
reciprocal(user2_k) == reciprocal(user_k) == kernel_k
i.e. if it's re-loaded back the compiled again value will be exactly
the same as it was. That said, the user2_k can be calculated like this
user2_k = reciprocal(kernel_k)
with an exception, that if kernel_k == 0, then user2_k == 1.
The optlen argument is treated like this -- when zero, kernel returns
the amount of sock_fprog elements in filter, otherwise it should be
large enough for the sock_fprog array.
changes since v1:
* Declared SO_GET_FILTER in all arch headers
* Added decode of vlan-tag codes
Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-11-01 06:01:48 +04:00
|
|
|
extern int sk_get_filter(struct sock *sk, struct sock_filter __user *filter, unsigned len);
|
2013-06-05 17:30:55 +04:00
|
|
|
extern void sk_decode_filter(struct sock_filter *filt, struct sock_filter *to);
|
2011-04-20 13:27:32 +04:00
|
|
|
|
|
|
|
#ifdef CONFIG_BPF_JIT
|
2013-05-02 00:24:08 +04:00
|
|
|
#include <stdarg.h>
|
2013-03-28 19:24:53 +04:00
|
|
|
#include <linux/linkage.h>
|
|
|
|
#include <linux/printk.h>
|
|
|
|
|
2011-04-20 13:27:32 +04:00
|
|
|
extern void bpf_jit_compile(struct sk_filter *fp);
|
|
|
|
extern void bpf_jit_free(struct sk_filter *fp);
|
2013-03-22 01:22:03 +04:00
|
|
|
|
|
|
|
static inline void bpf_jit_dump(unsigned int flen, unsigned int proglen,
|
|
|
|
u32 pass, void *image)
|
|
|
|
{
|
2013-05-17 20:57:37 +04:00
|
|
|
pr_err("flen=%u proglen=%u pass=%u image=%pK\n",
|
2013-03-22 01:22:03 +04:00
|
|
|
flen, proglen, pass, image);
|
|
|
|
if (image)
|
2013-05-17 20:57:37 +04:00
|
|
|
print_hex_dump(KERN_ERR, "JIT code: ", DUMP_PREFIX_OFFSET,
|
2013-03-22 01:22:03 +04:00
|
|
|
16, 1, image, proglen, false);
|
|
|
|
}
|
2011-04-20 13:27:32 +04:00
|
|
|
#define SK_RUN_FILTER(FILTER, SKB) (*FILTER->bpf_func)(SKB, FILTER->insns)
|
|
|
|
#else
|
2013-10-04 11:14:06 +04:00
|
|
|
#include <linux/slab.h>
|
2011-04-20 13:27:32 +04:00
|
|
|
/* No JIT configured: compilation is a no-op, the interpreter is used. */
static inline void bpf_jit_compile(struct sk_filter *fp)
{
}
|
|
|
|
/* No JIT configured: nothing native to tear down, just free the filter. */
static inline void bpf_jit_free(struct sk_filter *fp)
{
	kfree(fp);
}
|
|
|
|
#define SK_RUN_FILTER(FILTER, SKB) sk_run_filter(SKB, FILTER->insns)
|
|
|
|
#endif
|
|
|
|
|
2014-01-17 20:09:45 +04:00
|
|
|
static inline int bpf_tell_extensions(void)
|
|
|
|
{
|
net: filter: let bpf_tell_extensions return SKF_AD_MAX
Michal Sekletar added in commit ea02f9411d9f ("net: introduce
SO_BPF_EXTENSIONS") a facility where user space can enquire
the BPF ancillary instruction set, which is imho a step into
the right direction for letting user space high-level to BPF
optimizers make an informed decision for possibly using these
extensions.
The original rationale was to return through a getsockopt(2)
a bitfield of which instructions are supported and which
are not, as of right now, we just return 0 to indicate a
base support for SKF_AD_PROTOCOL up to SKF_AD_PAY_OFFSET.
Limitations of this approach are that this API which we need
to maintain for a long time can only support a maximum of 32
extensions, and needs to be additionally maintained/updated
when each new extension that comes in.
I thought about this a bit more and what we can do here to
overcome this is to just return SKF_AD_MAX. Since we never
remove any extension since we cannot break user space and
always linearly increase SKF_AD_MAX on each newly added
extension, user space can make a decision on what extensions
are supported in the whole set of extensions and which aren't,
by just checking which of them from the whole set have an
offset < SKF_AD_MAX of the underlying kernel.
Since SKF_AD_MAX must be updated each time we add new ones,
we don't need to introduce an additional enum and got
maintenance for free. At some point in time when
SO_BPF_EXTENSIONS becomes ubiquitous for most kernels, then
an application can simply make use of this and easily be run
on newer or older underlying kernels without needing to be
recompiled, of course. Since that is for 3.14, it's not too
late to do this change.
Cc: Michal Sekletar <msekleta@redhat.com>
Cc: Eric Dumazet <edumazet@google.com>
Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Acked-by: Michal Sekletar <msekleta@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-01-21 03:19:37 +04:00
|
|
|
return SKF_AD_MAX;
|
2014-01-17 20:09:45 +04:00
|
|
|
}
|
|
|
|
|
2011-04-20 13:27:32 +04:00
|
|
|
enum {
|
|
|
|
BPF_S_RET_K = 1,
|
|
|
|
BPF_S_RET_A,
|
|
|
|
BPF_S_ALU_ADD_K,
|
|
|
|
BPF_S_ALU_ADD_X,
|
|
|
|
BPF_S_ALU_SUB_K,
|
|
|
|
BPF_S_ALU_SUB_X,
|
|
|
|
BPF_S_ALU_MUL_K,
|
|
|
|
BPF_S_ALU_MUL_X,
|
|
|
|
BPF_S_ALU_DIV_X,
|
2012-09-08 02:03:35 +04:00
|
|
|
BPF_S_ALU_MOD_K,
|
|
|
|
BPF_S_ALU_MOD_X,
|
2011-04-20 13:27:32 +04:00
|
|
|
BPF_S_ALU_AND_K,
|
|
|
|
BPF_S_ALU_AND_X,
|
|
|
|
BPF_S_ALU_OR_K,
|
|
|
|
BPF_S_ALU_OR_X,
|
2012-09-24 06:23:59 +04:00
|
|
|
BPF_S_ALU_XOR_K,
|
|
|
|
BPF_S_ALU_XOR_X,
|
2011-04-20 13:27:32 +04:00
|
|
|
BPF_S_ALU_LSH_K,
|
|
|
|
BPF_S_ALU_LSH_X,
|
|
|
|
BPF_S_ALU_RSH_K,
|
|
|
|
BPF_S_ALU_RSH_X,
|
|
|
|
BPF_S_ALU_NEG,
|
|
|
|
BPF_S_LD_W_ABS,
|
|
|
|
BPF_S_LD_H_ABS,
|
|
|
|
BPF_S_LD_B_ABS,
|
|
|
|
BPF_S_LD_W_LEN,
|
|
|
|
BPF_S_LD_W_IND,
|
|
|
|
BPF_S_LD_H_IND,
|
|
|
|
BPF_S_LD_B_IND,
|
|
|
|
BPF_S_LD_IMM,
|
|
|
|
BPF_S_LDX_W_LEN,
|
|
|
|
BPF_S_LDX_B_MSH,
|
|
|
|
BPF_S_LDX_IMM,
|
|
|
|
BPF_S_MISC_TAX,
|
|
|
|
BPF_S_MISC_TXA,
|
|
|
|
BPF_S_ALU_DIV_K,
|
|
|
|
BPF_S_LD_MEM,
|
|
|
|
BPF_S_LDX_MEM,
|
|
|
|
BPF_S_ST,
|
|
|
|
BPF_S_STX,
|
|
|
|
BPF_S_JMP_JA,
|
|
|
|
BPF_S_JMP_JEQ_K,
|
|
|
|
BPF_S_JMP_JEQ_X,
|
|
|
|
BPF_S_JMP_JGE_K,
|
|
|
|
BPF_S_JMP_JGE_X,
|
|
|
|
BPF_S_JMP_JGT_K,
|
|
|
|
BPF_S_JMP_JGT_X,
|
|
|
|
BPF_S_JMP_JSET_K,
|
|
|
|
BPF_S_JMP_JSET_X,
|
|
|
|
/* Ancillary data */
|
|
|
|
BPF_S_ANC_PROTOCOL,
|
|
|
|
BPF_S_ANC_PKTTYPE,
|
|
|
|
BPF_S_ANC_IFINDEX,
|
|
|
|
BPF_S_ANC_NLATTR,
|
|
|
|
BPF_S_ANC_NLATTR_NEST,
|
|
|
|
BPF_S_ANC_MARK,
|
|
|
|
BPF_S_ANC_QUEUE,
|
|
|
|
BPF_S_ANC_HATYPE,
|
|
|
|
BPF_S_ANC_RXHASH,
|
|
|
|
BPF_S_ANC_CPU,
|
2012-03-31 15:01:20 +04:00
|
|
|
BPF_S_ANC_ALU_XOR_X,
|
2012-04-13 01:47:52 +04:00
|
|
|
BPF_S_ANC_SECCOMP_LD_W,
|
2012-10-27 06:26:17 +04:00
|
|
|
BPF_S_ANC_VLAN_TAG,
|
|
|
|
BPF_S_ANC_VLAN_TAG_PRESENT,
|
filter: add ANC_PAY_OFFSET instruction for loading payload start offset
It is very useful to do dynamic truncation of packets. In particular,
we're interested to push the necessary header bytes to the user space and
cut off user payload that should probably not be transferred for some reasons
(e.g. privacy, speed, or others). With the ancillary extension PAY_OFFSET,
we can load it into the accumulator, and return it. E.g. in bpfc syntax ...
ld #poff ; { 0x20, 0, 0, 0xfffff034 },
ret a ; { 0x16, 0, 0, 0x00000000 },
... as a filter will accomplish this without having to do a big hackery in
a BPF filter itself. Follow-up JIT implementations are welcome.
Thanks to Eric Dumazet for suggesting and discussing this during the
Netfilter Workshop in Copenhagen.
Suggested-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-03-19 10:39:31 +04:00
|
|
|
BPF_S_ANC_PAY_OFFSET,
|
2011-04-20 13:27:32 +04:00
|
|
|
};
|
|
|
|
|
2005-04-17 02:20:36 +04:00
|
|
|
#endif /* __LINUX_FILTER_H__ */
|