From fa697140f9a20119a9ec8fd7460cc4314fbdaff3 Mon Sep 17 00:00:00 2001 From: Dominik Brodowski Date: Thu, 5 Apr 2018 11:53:02 +0200 Subject: [PATCH] syscalls/x86: Use 'struct pt_regs' based syscall calling convention for 64-bit syscalls Let's make use of ARCH_HAS_SYSCALL_WRAPPER=y on pure 64-bit x86-64 systems: Each syscall defines a stub which takes struct pt_regs as its only argument. It decodes just those parameters it needs, e.g: asmlinkage long sys_xyzzy(const struct pt_regs *regs) { return SyS_xyzzy(regs->di, regs->si, regs->dx); } This approach avoids leaking random user-provided register content down the call chain. For example, for sys_recv() which is a 4-parameter syscall, the assembly now is (in slightly reordered fashion): <sys_recv>: callq <__fentry__> /* decode regs->di, ->si, ->dx and ->r10 */ mov 0x70(%rdi),%rdi mov 0x68(%rdi),%rsi mov 0x60(%rdi),%rdx mov 0x38(%rdi),%rcx [ SyS_recv() is automatically inlined by the compiler, as it is not [yet] used anywhere else ] /* clear %r9 and %r8, the 5th and 6th args */ xor %r9d,%r9d xor %r8d,%r8d /* do the actual work */ callq __sys_recvfrom /* cleanup and return */ cltq retq The only valid place in an x86-64 kernel which rightfully calls a syscall function on its own -- vsyscall -- needs to be modified to pass struct pt_regs onwards as well. To keep the syscall table generation working independent of SYSCALL_PTREGS being enabled, the stubs are named the same as the "original" syscall stubs, i.e. sys_*(). This patch is based on an original proof-of-concept | From: Linus Torvalds | Signed-off-by: Linus Torvalds and was split up and heavily modified by me, in particular to base it on ARCH_HAS_SYSCALL_WRAPPER, to limit it to 64-bit-only for the time being, and to update the vsyscall to the new calling convention. Signed-off-by: Dominik Brodowski Acked-by: Linus Torvalds Cc: Al Viro Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Josh Poimboeuf Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20180405095307.3730-4-linux@dominikbrodowski.net Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 5 ++ arch/x86/entry/common.c | 4 ++ arch/x86/entry/syscall_64.c | 9 +++- arch/x86/entry/vsyscall/vsyscall_64.c | 22 ++++++++ arch/x86/include/asm/syscall.h | 4 ++ arch/x86/include/asm/syscall_wrapper.h | 70 ++++++++++++++++++++++++++ arch/x86/include/asm/syscalls.h | 7 +++ include/linux/syscalls.h | 2 +- 8 files changed, 120 insertions(+), 3 deletions(-) create mode 100644 arch/x86/include/asm/syscall_wrapper.h diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 27fede438959..67348efc2540 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2954,3 +2954,8 @@ source "crypto/Kconfig" source "arch/x86/kvm/Kconfig" source "lib/Kconfig" + +config SYSCALL_PTREGS + def_bool y + depends on X86_64 && !COMPAT + select ARCH_HAS_SYSCALL_WRAPPER diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index a8b066dbbf48..e1b91bffa988 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -284,9 +284,13 @@ __visible void do_syscall_64(unsigned long nr, struct pt_regs *regs) nr &= __SYSCALL_MASK; if (likely(nr < NR_syscalls)) { nr = array_index_nospec(nr, NR_syscalls); +#ifdef CONFIG_SYSCALL_PTREGS + regs->ax = sys_call_table[nr](regs); +#else regs->ax = sys_call_table[nr]( regs->di, regs->si, regs->dx, regs->r10, regs->r8, regs->r9); +#endif } syscall_return_slowpath(regs); diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c index c176d2fab1da..6197850adf91 100644 --- a/arch/x86/entry/syscall_64.c +++ b/arch/x86/entry/syscall_64.c @@ -7,14 +7,19 @@ #include #include +#ifdef CONFIG_SYSCALL_PTREGS +/* this is a lie, but it does not hurt as sys_ni_syscall just returns -ENOSYS */ +extern asmlinkage long sys_ni_syscall(const struct pt_regs *); +#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long sym(const struct pt_regs *); +#else /* 
CONFIG_SYSCALL_PTREGS */ +extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); #define __SYSCALL_64(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); +#endif /* CONFIG_SYSCALL_PTREGS */ #include #undef __SYSCALL_64 #define __SYSCALL_64(nr, sym, qual) [nr] = sym, -extern long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); - asmlinkage const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { /* * Smells like a compiler bug -- it doesn't work diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c index 317be365bce3..05eebbf9b989 100644 --- a/arch/x86/entry/vsyscall/vsyscall_64.c +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -127,6 +127,9 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) int vsyscall_nr, syscall_nr, tmp; int prev_sig_on_uaccess_err; long ret; +#ifdef CONFIG_SYSCALL_PTREGS + unsigned long orig_dx; +#endif /* * No point in checking CS -- the only way to get here is a user mode @@ -227,19 +230,38 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) ret = -EFAULT; switch (vsyscall_nr) { case 0: +#ifdef CONFIG_SYSCALL_PTREGS + /* this decodes regs->di and regs->si on its own */ + ret = sys_gettimeofday(regs); +#else ret = sys_gettimeofday( (struct timeval __user *)regs->di, (struct timezone __user *)regs->si); +#endif /* CONFIG_SYSCALL_PTREGS */ break; case 1: +#ifdef CONFIG_SYSCALL_PTREGS + /* this decodes regs->di on its own */ + ret = sys_time(regs); +#else ret = sys_time((time_t __user *)regs->di); +#endif /* CONFIG_SYSCALL_PTREGS */ break; case 2: +#ifdef CONFIG_SYSCALL_PTREGS + /* while we could clobber regs->dx, we didn't in the past... 
*/ + orig_dx = regs->dx; + regs->dx = 0; + /* this decodes regs->di, regs->si and regs->dx on its own */ + ret = sys_getcpu(regs); + regs->dx = orig_dx; +#else ret = sys_getcpu((unsigned __user *)regs->di, (unsigned __user *)regs->si, NULL); +#endif /* CONFIG_SYSCALL_PTREGS */ break; } diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h index 03eedc21246d..17c62373a6f9 100644 --- a/arch/x86/include/asm/syscall.h +++ b/arch/x86/include/asm/syscall.h @@ -20,9 +20,13 @@ #include /* for TS_COMPAT */ #include +#ifdef CONFIG_SYSCALL_PTREGS +typedef asmlinkage long (*sys_call_ptr_t)(const struct pt_regs *); +#else typedef asmlinkage long (*sys_call_ptr_t)(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); +#endif /* CONFIG_SYSCALL_PTREGS */ extern const sys_call_ptr_t sys_call_table[]; #if defined(CONFIG_X86_32) diff --git a/arch/x86/include/asm/syscall_wrapper.h b/arch/x86/include/asm/syscall_wrapper.h new file mode 100644 index 000000000000..702bdee377af --- /dev/null +++ b/arch/x86/include/asm/syscall_wrapper.h @@ -0,0 +1,70 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * syscall_wrapper.h - x86 specific wrappers to syscall definitions + */ + +#ifndef _ASM_X86_SYSCALL_WRAPPER_H +#define _ASM_X86_SYSCALL_WRAPPER_H + +/* + * Instead of the generic __SYSCALL_DEFINEx() definition, this macro takes + * struct pt_regs *regs as the only argument of the syscall stub named + * sys_*(). It decodes just the registers it needs and passes them on to + * the SyS_*() wrapper and then to the SYSC_*() function doing the actual job. 
+ * These wrappers and functions are inlined, meaning that the assembly looks + * as follows (slightly re-ordered): + * + * <sys_recv>: <-- syscall with 4 parameters + * callq <__fentry__> + * + * mov 0x70(%rdi),%rdi <-- decode regs->di + * mov 0x68(%rdi),%rsi <-- decode regs->si + * mov 0x60(%rdi),%rdx <-- decode regs->dx + * mov 0x38(%rdi),%rcx <-- decode regs->r10 + * + * xor %r9d,%r9d <-- clear %r9 + * xor %r8d,%r8d <-- clear %r8 + * + * callq __sys_recvfrom <-- do the actual work in __sys_recvfrom() + * which takes 6 arguments + * + * cltq <-- extend return value to 64-bit + * retq <-- return + * + * This approach avoids leaking random user-provided register content down + * the call chain. + * + * As the generic SYSCALL_DEFINE0() macro does not decode any parameters for + * obvious reasons, and passing struct pt_regs *regs to it in %rdi does not + * hurt, there is no need to override it. + */ +#define __SYSCALL_DEFINEx(x, name, ...) \ + asmlinkage long sys##name(const struct pt_regs *regs); \ + ALLOW_ERROR_INJECTION(sys##name, ERRNO); \ + static long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \ + static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__)); \ + asmlinkage long sys##name(const struct pt_regs *regs) \ + { \ + return SyS##name(__MAP(x,__SC_ARGS \ + ,,regs->di,,regs->si,,regs->dx \ + ,,regs->r10,,regs->r8,,regs->r9)); \ + } \ + static long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \ + { \ + long ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__)); \ + __MAP(x,__SC_TEST,__VA_ARGS__); \ + __PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__)); \ + return ret; \ + } \ + static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__)) + +/* + * For VSYSCALLS, we need to declare these three syscalls with the new + * pt_regs-based calling convention for in-kernel use. 
+ */ +struct pt_regs; +asmlinkage long sys_getcpu(const struct pt_regs *regs); /* di,si,dx */ +asmlinkage long sys_gettimeofday(const struct pt_regs *regs); /* di,si */ +asmlinkage long sys_time(const struct pt_regs *regs); /* di */ + +#endif /* _ASM_X86_SYSCALL_WRAPPER_H */ diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h index ae6e05fdc24b..e4ad93c05f02 100644 --- a/arch/x86/include/asm/syscalls.h +++ b/arch/x86/include/asm/syscalls.h @@ -18,6 +18,12 @@ /* Common in X86_32 and X86_64 */ /* kernel/ioport.c */ long ksys_ioperm(unsigned long from, unsigned long num, int turn_on); + +#ifndef CONFIG_SYSCALL_PTREGS +/* + * If CONFIG_SYSCALL_PTREGS is enabled, a different syscall calling convention + * is used. Do not include these -- invalid -- prototypes then + */ asmlinkage long sys_ioperm(unsigned long, unsigned long, int); asmlinkage long sys_iopl(unsigned int); @@ -53,4 +59,5 @@ asmlinkage long sys_mmap(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); #endif /* CONFIG_X86_32 */ +#endif /* CONFIG_SYSCALL_PTREGS */ #endif /* _ASM_X86_SYSCALLS_H */ diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 503ab245d4ce..d7168b3a4b4c 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -102,7 +102,7 @@ union bpf_attr; * for SYSCALL_DEFINE/COMPAT_SYSCALL_DEFINE */ #define __MAP0(m,...) -#define __MAP1(m,t,a) m(t,a) +#define __MAP1(m,t,a,...) m(t,a) #define __MAP2(m,t,a,...) m(t,a), __MAP1(m,__VA_ARGS__) #define __MAP3(m,t,a,...) m(t,a), __MAP2(m,__VA_ARGS__) #define __MAP4(m,t,a,...) m(t,a), __MAP3(m,__VA_ARGS__)