bpf: Add support for BTF pointers to x86 JIT

Pointer to BTF object is a pointer to kernel object or NULL.
Such pointers can only be used by BPF_LDX instructions.
The verifier changed their opcode from LDX|MEM|size
to LDX|PROBE_MEM|size to make JITing easier.
The number of entries in extable is the number of BPF_LDX insns
that access kernel memory via "pointer to BTF type".
Only these load instructions can fault.
Since x86 extable is relative it has to be allocated in the same
memory region as JITed code.
Allocate it prior to last pass of JITing and let the last pass populate it.
Pointer to extable in bpf_prog_aux is necessary to make page fault
handling fast.
Page fault handling is done in two steps:
1. bpf_prog_kallsyms_find() finds BPF program that page faulted.
   It's done by walking rb tree.
2. then extable for given bpf program is binary searched.
This process is similar to how page faulting is done for kernel modules.
The exception handler skips over faulting x86 instruction and
initializes destination register with zero. This mimics exact
behavior of bpf_probe_read (when probe_kernel_read faults dest is zeroed).

JITs for other architectures can add support in similar way.
Until then they will reject unknown opcode and fallback to interpreter.

Since extable should be aligned and placed near JITed code
make bpf_jit_binary_alloc() return 4 byte aligned image offset,
so that extable aligning formula in bpf_int_jit_compile() doesn't need
to rely on internal implementation of bpf_jit_binary_alloc().
On x86 gcc defaults to 16-byte alignment for regular kernel functions
due to better performance. JITed code may be aligned to 16 in the future,
but it will use 4 in the meantime.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20191016032505.2089704-10-ast@kernel.org
This commit is contained in:
Alexei Starovoitov 2019-10-15 20:25:03 -07:00 коммит произвёл Daniel Borkmann
Родитель 2a02759ef5
Коммит 3dec541b2e
6 изменённых файлов: 128 добавлений и 5 удалений

Просмотреть файл

@ -9,7 +9,7 @@
#include <linux/filter.h> #include <linux/filter.h>
#include <linux/if_vlan.h> #include <linux/if_vlan.h>
#include <linux/bpf.h> #include <linux/bpf.h>
#include <asm/extable.h>
#include <asm/set_memory.h> #include <asm/set_memory.h>
#include <asm/nospec-branch.h> #include <asm/nospec-branch.h>
@ -123,6 +123,19 @@ static const int reg2hex[] = {
[AUX_REG] = 3, /* R11 temp register */ [AUX_REG] = 3, /* R11 temp register */
}; };
static const int reg2pt_regs[] = {
[BPF_REG_0] = offsetof(struct pt_regs, ax),
[BPF_REG_1] = offsetof(struct pt_regs, di),
[BPF_REG_2] = offsetof(struct pt_regs, si),
[BPF_REG_3] = offsetof(struct pt_regs, dx),
[BPF_REG_4] = offsetof(struct pt_regs, cx),
[BPF_REG_5] = offsetof(struct pt_regs, r8),
[BPF_REG_6] = offsetof(struct pt_regs, bx),
[BPF_REG_7] = offsetof(struct pt_regs, r13),
[BPF_REG_8] = offsetof(struct pt_regs, r14),
[BPF_REG_9] = offsetof(struct pt_regs, r15),
};
/* /*
* is_ereg() == true if BPF register 'reg' maps to x86-64 r8..r15 * is_ereg() == true if BPF register 'reg' maps to x86-64 r8..r15
* which need extra byte of encoding. * which need extra byte of encoding.
@ -377,6 +390,19 @@ static void emit_mov_reg(u8 **pprog, bool is64, u32 dst_reg, u32 src_reg)
*pprog = prog; *pprog = prog;
} }
static bool ex_handler_bpf(const struct exception_table_entry *x,
struct pt_regs *regs, int trapnr,
unsigned long error_code, unsigned long fault_addr)
{
u32 reg = x->fixup >> 8;
/* jump over faulting load and clear dest register */
*(unsigned long *)((void *)regs + reg) = 0;
regs->ip += x->fixup & 0xff;
return true;
}
static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
int oldproglen, struct jit_context *ctx) int oldproglen, struct jit_context *ctx)
{ {
@ -384,7 +410,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
int insn_cnt = bpf_prog->len; int insn_cnt = bpf_prog->len;
bool seen_exit = false; bool seen_exit = false;
u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY]; u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY];
int i, cnt = 0; int i, cnt = 0, excnt = 0;
int proglen = 0; int proglen = 0;
u8 *prog = temp; u8 *prog = temp;
@ -778,14 +804,17 @@ stx: if (is_imm8(insn->off))
/* LDX: dst_reg = *(u8*)(src_reg + off) */ /* LDX: dst_reg = *(u8*)(src_reg + off) */
case BPF_LDX | BPF_MEM | BPF_B: case BPF_LDX | BPF_MEM | BPF_B:
case BPF_LDX | BPF_PROBE_MEM | BPF_B:
/* Emit 'movzx rax, byte ptr [rax + off]' */ /* Emit 'movzx rax, byte ptr [rax + off]' */
EMIT3(add_2mod(0x48, src_reg, dst_reg), 0x0F, 0xB6); EMIT3(add_2mod(0x48, src_reg, dst_reg), 0x0F, 0xB6);
goto ldx; goto ldx;
case BPF_LDX | BPF_MEM | BPF_H: case BPF_LDX | BPF_MEM | BPF_H:
case BPF_LDX | BPF_PROBE_MEM | BPF_H:
/* Emit 'movzx rax, word ptr [rax + off]' */ /* Emit 'movzx rax, word ptr [rax + off]' */
EMIT3(add_2mod(0x48, src_reg, dst_reg), 0x0F, 0xB7); EMIT3(add_2mod(0x48, src_reg, dst_reg), 0x0F, 0xB7);
goto ldx; goto ldx;
case BPF_LDX | BPF_MEM | BPF_W: case BPF_LDX | BPF_MEM | BPF_W:
case BPF_LDX | BPF_PROBE_MEM | BPF_W:
/* Emit 'mov eax, dword ptr [rax+0x14]' */ /* Emit 'mov eax, dword ptr [rax+0x14]' */
if (is_ereg(dst_reg) || is_ereg(src_reg)) if (is_ereg(dst_reg) || is_ereg(src_reg))
EMIT2(add_2mod(0x40, src_reg, dst_reg), 0x8B); EMIT2(add_2mod(0x40, src_reg, dst_reg), 0x8B);
@ -793,6 +822,7 @@ stx: if (is_imm8(insn->off))
EMIT1(0x8B); EMIT1(0x8B);
goto ldx; goto ldx;
case BPF_LDX | BPF_MEM | BPF_DW: case BPF_LDX | BPF_MEM | BPF_DW:
case BPF_LDX | BPF_PROBE_MEM | BPF_DW:
/* Emit 'mov rax, qword ptr [rax+0x14]' */ /* Emit 'mov rax, qword ptr [rax+0x14]' */
EMIT2(add_2mod(0x48, src_reg, dst_reg), 0x8B); EMIT2(add_2mod(0x48, src_reg, dst_reg), 0x8B);
ldx: /* ldx: /*
@ -805,6 +835,48 @@ ldx: /*
else else
EMIT1_off32(add_2reg(0x80, src_reg, dst_reg), EMIT1_off32(add_2reg(0x80, src_reg, dst_reg),
insn->off); insn->off);
if (BPF_MODE(insn->code) == BPF_PROBE_MEM) {
struct exception_table_entry *ex;
u8 *_insn = image + proglen;
s64 delta;
if (!bpf_prog->aux->extable)
break;
if (excnt >= bpf_prog->aux->num_exentries) {
pr_err("ex gen bug\n");
return -EFAULT;
}
ex = &bpf_prog->aux->extable[excnt++];
delta = _insn - (u8 *)&ex->insn;
if (!is_simm32(delta)) {
pr_err("extable->insn doesn't fit into 32-bit\n");
return -EFAULT;
}
ex->insn = delta;
delta = (u8 *)ex_handler_bpf - (u8 *)&ex->handler;
if (!is_simm32(delta)) {
pr_err("extable->handler doesn't fit into 32-bit\n");
return -EFAULT;
}
ex->handler = delta;
if (dst_reg > BPF_REG_9) {
pr_err("verifier error\n");
return -EFAULT;
}
/*
* Compute size of x86 insn and its target dest x86 register.
* ex_handler_bpf() will use lower 8 bits to adjust
* pt_regs->ip to jump over this x86 instruction
* and upper bits to figure out which pt_regs to zero out.
* End result: x86 insn "mov rbx, qword ptr [rax+0x14]"
* of 4 bytes will be ignored and rbx will be zero inited.
*/
ex->fixup = (prog - temp) | (reg2pt_regs[dst_reg] << 8);
}
break; break;
/* STX XADD: lock *(u32*)(dst_reg + off) += src_reg */ /* STX XADD: lock *(u32*)(dst_reg + off) += src_reg */
@ -1058,6 +1130,11 @@ emit_jmp:
addrs[i] = proglen; addrs[i] = proglen;
prog = temp; prog = temp;
} }
if (image && excnt != bpf_prog->aux->num_exentries) {
pr_err("extable is not populated\n");
return -EFAULT;
}
return proglen; return proglen;
} }
@ -1158,12 +1235,24 @@ out_image:
break; break;
} }
if (proglen == oldproglen) { if (proglen == oldproglen) {
header = bpf_jit_binary_alloc(proglen, &image, /*
1, jit_fill_hole); * The number of entries in extable is the number of BPF_LDX
* insns that access kernel memory via "pointer to BTF type".
* The verifier changed their opcode from LDX|MEM|size
* to LDX|PROBE_MEM|size to make JITing easier.
*/
u32 align = __alignof__(struct exception_table_entry);
u32 extable_size = prog->aux->num_exentries *
sizeof(struct exception_table_entry);
/* allocate module memory for x86 insns and extable */
header = bpf_jit_binary_alloc(roundup(proglen, align) + extable_size,
&image, align, jit_fill_hole);
if (!header) { if (!header) {
prog = orig_prog; prog = orig_prog;
goto out_addrs; goto out_addrs;
} }
prog->aux->extable = (void *) image + roundup(proglen, align);
} }
oldproglen = proglen; oldproglen = proglen;
cond_resched(); cond_resched();

Просмотреть файл

@ -24,6 +24,7 @@ struct sock;
struct seq_file; struct seq_file;
struct btf; struct btf;
struct btf_type; struct btf_type;
struct exception_table_entry;
extern struct idr btf_idr; extern struct idr btf_idr;
extern spinlock_t btf_idr_lock; extern spinlock_t btf_idr_lock;
@ -423,6 +424,8 @@ struct bpf_prog_aux {
* main prog always has linfo_idx == 0 * main prog always has linfo_idx == 0
*/ */
u32 linfo_idx; u32 linfo_idx;
u32 num_exentries;
struct exception_table_entry *extable;
struct bpf_prog_stats __percpu *stats; struct bpf_prog_stats __percpu *stats;
union { union {
struct work_struct work; struct work_struct work;

Просмотреть файл

@ -33,4 +33,14 @@ search_module_extables(unsigned long addr)
} }
#endif /*CONFIG_MODULES*/ #endif /*CONFIG_MODULES*/
#ifdef CONFIG_BPF_JIT
const struct exception_table_entry *search_bpf_extables(unsigned long addr);
#else
static inline const struct exception_table_entry *
search_bpf_extables(unsigned long addr)
{
return NULL;
}
#endif
#endif /* _LINUX_EXTABLE_H */ #endif /* _LINUX_EXTABLE_H */

Просмотреть файл

@ -30,7 +30,7 @@
#include <linux/kallsyms.h> #include <linux/kallsyms.h>
#include <linux/rcupdate.h> #include <linux/rcupdate.h>
#include <linux/perf_event.h> #include <linux/perf_event.h>
#include <linux/extable.h>
#include <asm/unaligned.h> #include <asm/unaligned.h>
/* Registers */ /* Registers */
@ -712,6 +712,24 @@ bool is_bpf_text_address(unsigned long addr)
return ret; return ret;
} }
const struct exception_table_entry *search_bpf_extables(unsigned long addr)
{
const struct exception_table_entry *e = NULL;
struct bpf_prog *prog;
rcu_read_lock();
prog = bpf_prog_kallsyms_find(addr);
if (!prog)
goto out;
if (!prog->aux->num_exentries)
goto out;
e = search_extable(prog->aux->extable, prog->aux->num_exentries, addr);
out:
rcu_read_unlock();
return e;
}
int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
char *sym) char *sym)
{ {

Просмотреть файл

@ -8729,6 +8729,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
return -EINVAL; return -EINVAL;
} }
insn->code = BPF_LDX | BPF_PROBE_MEM | BPF_SIZE((insn)->code); insn->code = BPF_LDX | BPF_PROBE_MEM | BPF_SIZE((insn)->code);
env->prog->aux->num_exentries++;
continue; continue;
default: default:
continue; continue;

Просмотреть файл

@ -56,6 +56,8 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr)
e = search_kernel_exception_table(addr); e = search_kernel_exception_table(addr);
if (!e) if (!e)
e = search_module_extables(addr); e = search_module_extables(addr);
if (!e)
e = search_bpf_extables(addr);
return e; return e;
} }