Merge tag 'x86-mm-2020-06-05' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 mm updates from Ingo Molnar:
 "Misc changes:

   - Unexport various PAT primitives

   - Unexport per-CPU tlbstate and uninline TLB helpers"

* tag 'x86-mm-2020-06-05' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (23 commits)
  x86/tlb/uv: Add a forward declaration for struct flush_tlb_info
  x86/cpu: Export native_write_cr4() only when CONFIG_LKTDM=m
  x86/tlb: Restrict access to tlbstate
  xen/privcmd: Remove unneeded asm/tlb.h include
  x86/tlb: Move PCID helpers where they are used
  x86/tlb: Uninline nmi_uaccess_okay()
  x86/tlb: Move cr4_set_bits_and_update_boot() to the usage site
  x86/tlb: Move paravirt_tlb_remove_table() to the usage site
  x86/tlb: Move __flush_tlb_all() out of line
  x86/tlb: Move flush_tlb_others() out of line
  x86/tlb: Move __flush_tlb_one_kernel() out of line
  x86/tlb: Move __flush_tlb_one_user() out of line
  x86/tlb: Move __flush_tlb_global() out of line
  x86/tlb: Move __flush_tlb() out of line
  x86/alternatives: Move temporary_mm helpers into C
  x86/cr4: Sanitize CR4.PCE update
  x86/cpu: Uninline CR4 accessors
  x86/tlb: Uninline __get_current_cr3_fast()
  x86/mm: Use pgprotval_t in protval_4k_2_large() and protval_large_2_4k()
  x86/mm: Unexport __cachemode2pte_tbl
  ...
Commit: f4dd60a3d4
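
Annotation (not part of the commit): a recurring pattern in this series is converting header inlines that poke cpu_tlbstate into out-of-line functions in arch/x86/mm/tlb.c, so the per-CPU tlbstate no longer has to be exported to modules. The following is a minimal standalone sketch of that STATIC_NOPV indirection, covering only the !CONFIG_PARAVIRT case; it compiles with any C compiler, and the puts() stands in for the real CR3 write.

#include <stdio.h>

/* !CONFIG_PARAVIRT: the native implementation can be static, and the
 * __flush_tlb_local() helper is simply an alias for it. With paravirt
 * enabled, STATIC_NOPV would expand to nothing and __flush_tlb_local()
 * would dispatch through pv_ops.mmu.flush_tlb_user instead.
 */
#define STATIC_NOPV		static
#define __flush_tlb_local	native_flush_tlb_local

static void native_flush_tlb_local(void);

/* Formerly a header inline reading cpu_tlbstate directly; now private. */
STATIC_NOPV void native_flush_tlb_local(void)
{
	puts("write CR3 back to itself -> flush non-global TLB entries");
}

/* The single out-of-line entry point that callers and modules use. */
void flush_tlb_local(void)
{
	__flush_tlb_local();
}

int main(void)
{
	flush_tlb_local();
	return 0;
}
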
arch/x86/events/core.c
@@ -2166,11 +2166,6 @@ static int x86_pmu_event_init(struct perf_event *event)
 	return err;
 }
 
-static void refresh_pce(void *ignored)
-{
-	load_mm_cr4_irqsoff(this_cpu_read(cpu_tlbstate.loaded_mm));
-}
-
 static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm)
 {
 	if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
@@ -2189,7 +2184,7 @@ static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm)
 	lockdep_assert_held_write(&mm->mmap_sem);
 
 	if (atomic_inc_return(&mm->context.perf_rdpmc_allowed) == 1)
-		on_each_cpu_mask(mm_cpumask(mm), refresh_pce, NULL, 1);
+		on_each_cpu_mask(mm_cpumask(mm), cr4_update_pce, NULL, 1);
 }
 
 static void x86_pmu_event_unmapped(struct perf_event *event, struct mm_struct *m
@@ -2199,7 +2194,7 @@ static void x86_pmu_event_unmapped(struct perf_event *event, struct mm_struct *m
 		return;
 
 	if (atomic_dec_and_test(&mm->context.perf_rdpmc_allowed))
-		on_each_cpu_mask(mm_cpumask(mm), refresh_pce, NULL, 1);
+		on_each_cpu_mask(mm_cpumask(mm), cr4_update_pce, NULL, 1);
 }
 
 static int x86_pmu_event_idx(struct perf_event *event)
@@ -2257,7 +2252,7 @@ static ssize_t set_attr_rdpmc(struct device *cdev,
 	else if (x86_pmu.attr_rdpmc == 2)
 		static_branch_dec(&rdpmc_always_available_key);
 
-	on_each_cpu(refresh_pce, NULL, 1);
+	on_each_cpu(cr4_update_pce, NULL, 1);
 	x86_pmu.attr_rdpmc = val;
 }
 
arch/x86/include/asm/memtype.h
@@ -24,4 +24,7 @@ extern void memtype_free_io(resource_size_t start, resource_size_t end);
 
 extern bool pat_pfn_immune_to_uc_mtrr(unsigned long pfn);
 
+bool x86_has_pat_wp(void);
+enum page_cache_mode pgprot2cachemode(pgprot_t pgprot);
+
 #endif /* _ASM_X86_MEMTYPE_H */
arch/x86/include/asm/mmu_context.h
@@ -24,21 +24,9 @@ static inline void paravirt_activate_mm(struct mm_struct *prev,
 #endif /* !CONFIG_PARAVIRT_XXL */
 
 #ifdef CONFIG_PERF_EVENTS
-
 DECLARE_STATIC_KEY_FALSE(rdpmc_never_available_key);
 DECLARE_STATIC_KEY_FALSE(rdpmc_always_available_key);
-
-static inline void load_mm_cr4_irqsoff(struct mm_struct *mm)
-{
-	if (static_branch_unlikely(&rdpmc_always_available_key) ||
-	    (!static_branch_unlikely(&rdpmc_never_available_key) &&
-	     atomic_read(&mm->context.perf_rdpmc_allowed)))
-		cr4_set_bits_irqsoff(X86_CR4_PCE);
-	else
-		cr4_clear_bits_irqsoff(X86_CR4_PCE);
-}
-#else
-static inline void load_mm_cr4_irqsoff(struct mm_struct *mm) {}
+void cr4_update_pce(void *ignored);
 #endif
 
 #ifdef CONFIG_MODIFY_LDT_SYSCALL
@@ -225,78 +213,6 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
 	return __pkru_allows_pkey(vma_pkey(vma), write);
 }
 
-/*
- * This can be used from process context to figure out what the value of
- * CR3 is without needing to do a (slow) __read_cr3().
- *
- * It's intended to be used for code like KVM that sneakily changes CR3
- * and needs to restore it. It needs to be used very carefully.
- */
-static inline unsigned long __get_current_cr3_fast(void)
-{
-	unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd,
-		this_cpu_read(cpu_tlbstate.loaded_mm_asid));
-
-	/* For now, be very restrictive about when this can be called. */
-	VM_WARN_ON(in_nmi() || preemptible());
-
-	VM_BUG_ON(cr3 != __read_cr3());
-	return cr3;
-}
-
-typedef struct {
-	struct mm_struct *mm;
-} temp_mm_state_t;
-
-/*
- * Using a temporary mm allows to set temporary mappings that are not accessible
- * by other CPUs. Such mappings are needed to perform sensitive memory writes
- * that override the kernel memory protections (e.g., W^X), without exposing the
- * temporary page-table mappings that are required for these write operations to
- * other CPUs. Using a temporary mm also allows to avoid TLB shootdowns when the
- * mapping is torn down.
- *
- * Context: The temporary mm needs to be used exclusively by a single core. To
- *          harden security IRQs must be disabled while the temporary mm is
- *          loaded, thereby preventing interrupt handler bugs from overriding
- *          the kernel memory protection.
- */
-static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm)
-{
-	temp_mm_state_t temp_state;
-
-	lockdep_assert_irqs_disabled();
-	temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm);
-	switch_mm_irqs_off(NULL, mm, current);
-
-	/*
-	 * If breakpoints are enabled, disable them while the temporary mm is
-	 * used. Userspace might set up watchpoints on addresses that are used
-	 * in the temporary mm, which would lead to wrong signals being sent or
-	 * crashes.
-	 *
-	 * Note that breakpoints are not disabled selectively, which also causes
-	 * kernel breakpoints (e.g., perf's) to be disabled. This might be
-	 * undesirable, but still seems reasonable as the code that runs in the
-	 * temporary mm should be short.
-	 */
-	if (hw_breakpoint_active())
-		hw_breakpoint_disable();
-
-	return temp_state;
-}
-
-static inline void unuse_temporary_mm(temp_mm_state_t prev_state)
-{
-	lockdep_assert_irqs_disabled();
-	switch_mm_irqs_off(NULL, prev_state.mm, current);
-
-	/*
-	 * Restore the breakpoints if they were disabled before the temporary mm
-	 * was loaded.
-	 */
-	if (hw_breakpoint_active())
-		hw_breakpoint_restore();
-}
+unsigned long __get_current_cr3_fast(void);
 
 #endif /* _ASM_X86_MMU_CONTEXT_H */
arch/x86/include/asm/paravirt.h
@@ -47,7 +47,13 @@ static inline void slow_down_io(void)
 #endif
 }
 
-static inline void __flush_tlb(void)
+void native_flush_tlb_local(void);
+void native_flush_tlb_global(void);
+void native_flush_tlb_one_user(unsigned long addr);
+void native_flush_tlb_others(const struct cpumask *cpumask,
+			     const struct flush_tlb_info *info);
+
+static inline void __flush_tlb_local(void)
 {
 	PVOP_VCALL0(mmu.flush_tlb_user);
 }
@@ -62,8 +68,8 @@ static inline void __flush_tlb_one_user(unsigned long addr)
 	PVOP_VCALL1(mmu.flush_tlb_one_user, addr);
 }
 
-static inline void flush_tlb_others(const struct cpumask *cpumask,
-				    const struct flush_tlb_info *info)
+static inline void __flush_tlb_others(const struct cpumask *cpumask,
+				      const struct flush_tlb_info *info)
 {
 	PVOP_VCALL2(mmu.flush_tlb_others, cpumask, info);
 }
arch/x86/include/asm/pgtable_32.h
@@ -60,7 +60,7 @@ void sync_initial_page_table(void);
 #define kpte_clear_flush(ptep, vaddr)		\
 do {						\
 	pte_clear(&init_mm, (vaddr), (ptep));	\
-	__flush_tlb_one_kernel((vaddr));	\
+	flush_tlb_one_kernel((vaddr));		\
 } while (0)
 
 #endif /* !__ASSEMBLY__ */
arch/x86/include/asm/pgtable_types.h
@@ -471,9 +471,6 @@ static inline pteval_t pte_flags(pte_t pte)
 	return native_pte_val(pte) & PTE_FLAGS_MASK;
 }
 
-extern uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM];
-extern uint8_t __pte2cachemode_tbl[8];
-
 #define __pte2cm_idx(cb)				\
 	((((cb) >> (_PAGE_BIT_PAT - 2)) & 4) |		\
 	 (((cb) >> (_PAGE_BIT_PCD - 1)) & 2) |		\
@@ -483,43 +480,26 @@ extern uint8_t __pte2cachemode_tbl[8];
 	 (((i) & 2) << (_PAGE_BIT_PCD - 1)) |	\
 	 (((i) & 1) << _PAGE_BIT_PWT))
 
-static inline unsigned long cachemode2protval(enum page_cache_mode pcm)
-{
-	if (likely(pcm == 0))
-		return 0;
-	return __cachemode2pte_tbl[pcm];
-}
-static inline pgprot_t cachemode2pgprot(enum page_cache_mode pcm)
-{
-	return __pgprot(cachemode2protval(pcm));
-}
-static inline enum page_cache_mode pgprot2cachemode(pgprot_t pgprot)
-{
-	unsigned long masked;
+unsigned long cachemode2protval(enum page_cache_mode pcm);
 
-	masked = pgprot_val(pgprot) & _PAGE_CACHE_MASK;
-	if (likely(masked == 0))
-		return 0;
-	return __pte2cachemode_tbl[__pte2cm_idx(masked)];
+static inline pgprotval_t protval_4k_2_large(pgprotval_t val)
+{
+	return (val & ~(_PAGE_PAT | _PAGE_PAT_LARGE)) |
+		((val & _PAGE_PAT) << (_PAGE_BIT_PAT_LARGE - _PAGE_BIT_PAT));
 }
 static inline pgprot_t pgprot_4k_2_large(pgprot_t pgprot)
 {
-	pgprotval_t val = pgprot_val(pgprot);
-	pgprot_t new;
-
-	pgprot_val(new) = (val & ~(_PAGE_PAT | _PAGE_PAT_LARGE)) |
-		((val & _PAGE_PAT) << (_PAGE_BIT_PAT_LARGE - _PAGE_BIT_PAT));
-	return new;
+	return __pgprot(protval_4k_2_large(pgprot_val(pgprot)));
 }
+static inline pgprotval_t protval_large_2_4k(pgprotval_t val)
+{
+	return (val & ~(_PAGE_PAT | _PAGE_PAT_LARGE)) |
+		((val & _PAGE_PAT_LARGE) >>
+		 (_PAGE_BIT_PAT_LARGE - _PAGE_BIT_PAT));
+}
 static inline pgprot_t pgprot_large_2_4k(pgprot_t pgprot)
 {
-	pgprotval_t val = pgprot_val(pgprot);
-	pgprot_t new;
-
-	pgprot_val(new) = (val & ~(_PAGE_PAT | _PAGE_PAT_LARGE)) |
-		((val & _PAGE_PAT_LARGE) >>
-		 (_PAGE_BIT_PAT_LARGE - _PAGE_BIT_PAT));
-	return new;
+	return __pgprot(protval_large_2_4k(pgprot_val(pgprot)));
 }
 
 
arch/x86/include/asm/tlbflush.h
@@ -13,141 +13,52 @@
 #include <asm/pti.h>
 #include <asm/processor-flags.h>
 
-/*
- * The x86 feature is called PCID (Process Context IDentifier). It is similar
- * to what is traditionally called ASID on the RISC processors.
- *
- * We don't use the traditional ASID implementation, where each process/mm gets
- * its own ASID and flush/restart when we run out of ASID space.
- *
- * Instead we have a small per-cpu array of ASIDs and cache the last few mm's
- * that came by on this CPU, allowing cheaper switch_mm between processes on
- * this CPU.
- *
- * We end up with different spaces for different things. To avoid confusion we
- * use different names for each of them:
- *
- * ASID  - [0, TLB_NR_DYN_ASIDS-1]
- *         the canonical identifier for an mm
- *
- * kPCID - [1, TLB_NR_DYN_ASIDS]
- *         the value we write into the PCID part of CR3; corresponds to the
- *         ASID+1, because PCID 0 is special.
- *
- * uPCID - [2048 + 1, 2048 + TLB_NR_DYN_ASIDS]
- *         for KPTI each mm has two address spaces and thus needs two
- *         PCID values, but we can still do with a single ASID denomination
- *         for each mm. Corresponds to kPCID + 2048.
- *
- */
+void __flush_tlb_all(void);
 
-/* There are 12 bits of space for ASIDS in CR3 */
-#define CR3_HW_ASID_BITS		12
+#define TLB_FLUSH_ALL	-1UL
 
-/*
- * When enabled, PAGE_TABLE_ISOLATION consumes a single bit for
- * user/kernel switches
- */
-#ifdef CONFIG_PAGE_TABLE_ISOLATION
-# define PTI_CONSUMED_PCID_BITS	1
-#else
-# define PTI_CONSUMED_PCID_BITS	0
-#endif
+void cr4_update_irqsoff(unsigned long set, unsigned long clear);
+unsigned long cr4_read_shadow(void);
 
-#define CR3_AVAIL_PCID_BITS (X86_CR3_PCID_BITS - PTI_CONSUMED_PCID_BITS)
+/* Set in this cpu's CR4. */
+static inline void cr4_set_bits_irqsoff(unsigned long mask)
+{
+	cr4_update_irqsoff(mask, 0);
+}
 
-/*
- * ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid. -1 below to account
- * for them being zero-based. Another -1 is because PCID 0 is reserved for
- * use by non-PCID-aware users.
- */
-#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_PCID_BITS) - 2)
+/* Clear in this cpu's CR4. */
+static inline void cr4_clear_bits_irqsoff(unsigned long mask)
+{
+	cr4_update_irqsoff(0, mask);
+}
+
+/* Set in this cpu's CR4. */
+static inline void cr4_set_bits(unsigned long mask)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	cr4_set_bits_irqsoff(mask);
+	local_irq_restore(flags);
+}
+
+/* Clear in this cpu's CR4. */
+static inline void cr4_clear_bits(unsigned long mask)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	cr4_clear_bits_irqsoff(mask);
+	local_irq_restore(flags);
+}
 
+#ifndef MODULE
 /*
  * 6 because 6 should be plenty and struct tlb_state will fit in two cache
  * lines.
  */
 #define TLB_NR_DYN_ASIDS	6
 
-/*
- * Given @asid, compute kPCID
- */
-static inline u16 kern_pcid(u16 asid)
-{
-	VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
-
-#ifdef CONFIG_PAGE_TABLE_ISOLATION
-	/*
-	 * Make sure that the dynamic ASID space does not confict with the
-	 * bit we are using to switch between user and kernel ASIDs.
-	 */
-	BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_PCID_USER_BIT));
-
-	/*
-	 * The ASID being passed in here should have respected the
-	 * MAX_ASID_AVAILABLE and thus never have the switch bit set.
-	 */
-	VM_WARN_ON_ONCE(asid & (1 << X86_CR3_PTI_PCID_USER_BIT));
-#endif
-	/*
-	 * The dynamically-assigned ASIDs that get passed in are small
-	 * (<TLB_NR_DYN_ASIDS). They never have the high switch bit set,
-	 * so do not bother to clear it.
-	 *
-	 * If PCID is on, ASID-aware code paths put the ASID+1 into the
-	 * PCID bits. This serves two purposes. It prevents a nasty
-	 * situation in which PCID-unaware code saves CR3, loads some other
-	 * value (with PCID == 0), and then restores CR3, thus corrupting
-	 * the TLB for ASID 0 if the saved ASID was nonzero. It also means
-	 * that any bugs involving loading a PCID-enabled CR3 with
-	 * CR4.PCIDE off will trigger deterministically.
-	 */
-	return asid + 1;
-}
-
-/*
- * Given @asid, compute uPCID
- */
-static inline u16 user_pcid(u16 asid)
-{
-	u16 ret = kern_pcid(asid);
-#ifdef CONFIG_PAGE_TABLE_ISOLATION
-	ret |= 1 << X86_CR3_PTI_PCID_USER_BIT;
-#endif
-	return ret;
-}
-
-struct pgd_t;
-static inline unsigned long build_cr3(pgd_t *pgd, u16 asid)
-{
-	if (static_cpu_has(X86_FEATURE_PCID)) {
-		return __sme_pa(pgd) | kern_pcid(asid);
-	} else {
-		VM_WARN_ON_ONCE(asid != 0);
-		return __sme_pa(pgd);
-	}
-}
-
-static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid)
-{
-	VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
-	/*
-	 * Use boot_cpu_has() instead of this_cpu_has() as this function
-	 * might be called during early boot. This should work even after
-	 * boot because all CPU's the have same capabilities:
-	 */
-	VM_WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_PCID));
-	return __sme_pa(pgd) | kern_pcid(asid) | CR3_NOFLUSH;
-}
-
-#ifdef CONFIG_PARAVIRT
-#include <asm/paravirt.h>
-#else
-#define __flush_tlb() __native_flush_tlb()
-#define __flush_tlb_global() __native_flush_tlb_global()
-#define __flush_tlb_one_user(addr) __native_flush_tlb_one_user(addr)
-#endif
-
 struct tlb_context {
 	u64 ctx_id;
 	u64 tlb_gen;
@@ -242,38 +153,7 @@ struct tlb_state {
 };
 DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate);
 
-/*
- * Blindly accessing user memory from NMI context can be dangerous
- * if we're in the middle of switching the current user task or
- * switching the loaded mm. It can also be dangerous if we
- * interrupted some kernel code that was temporarily using a
- * different mm.
- */
-static inline bool nmi_uaccess_okay(void)
-{
-	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
-	struct mm_struct *current_mm = current->mm;
-
-	VM_WARN_ON_ONCE(!loaded_mm);
-
-	/*
-	 * The condition we want to check is
-	 * current_mm->pgd == __va(read_cr3_pa()). This may be slow, though,
-	 * if we're running in a VM with shadow paging, and nmi_uaccess_okay()
-	 * is supposed to be reasonably fast.
-	 *
-	 * Instead, we check the almost equivalent but somewhat conservative
-	 * condition below, and we rely on the fact that switch_mm_irqs_off()
-	 * sets loaded_mm to LOADED_MM_SWITCHING before writing to CR3.
-	 */
-	if (loaded_mm != current_mm)
-		return false;
-
-	VM_WARN_ON_ONCE(current_mm->pgd != __va(read_cr3_pa()));
-
-	return true;
-}
+bool nmi_uaccess_okay(void);
+#define nmi_uaccess_okay nmi_uaccess_okay
 
 /* Initialize cr4 shadow for this CPU. */
@@ -282,249 +162,11 @@ static inline void cr4_init_shadow(void)
 	this_cpu_write(cpu_tlbstate.cr4, __read_cr4());
 }
 
-static inline void __cr4_set(unsigned long cr4)
-{
-	lockdep_assert_irqs_disabled();
-	this_cpu_write(cpu_tlbstate.cr4, cr4);
-	__write_cr4(cr4);
-}
-
-/* Set in this cpu's CR4. */
-static inline void cr4_set_bits_irqsoff(unsigned long mask)
-{
-	unsigned long cr4;
-
-	cr4 = this_cpu_read(cpu_tlbstate.cr4);
-	if ((cr4 | mask) != cr4)
-		__cr4_set(cr4 | mask);
-}
-
-/* Clear in this cpu's CR4. */
-static inline void cr4_clear_bits_irqsoff(unsigned long mask)
-{
-	unsigned long cr4;
-
-	cr4 = this_cpu_read(cpu_tlbstate.cr4);
-	if ((cr4 & ~mask) != cr4)
-		__cr4_set(cr4 & ~mask);
-}
-
-/* Set in this cpu's CR4. */
-static inline void cr4_set_bits(unsigned long mask)
-{
-	unsigned long flags;
-
-	local_irq_save(flags);
-	cr4_set_bits_irqsoff(mask);
-	local_irq_restore(flags);
-}
-
-/* Clear in this cpu's CR4. */
-static inline void cr4_clear_bits(unsigned long mask)
-{
-	unsigned long flags;
-
-	local_irq_save(flags);
-	cr4_clear_bits_irqsoff(mask);
-	local_irq_restore(flags);
-}
-
-static inline void cr4_toggle_bits_irqsoff(unsigned long mask)
-{
-	unsigned long cr4;
-
-	cr4 = this_cpu_read(cpu_tlbstate.cr4);
-	__cr4_set(cr4 ^ mask);
-}
-
-/* Read the CR4 shadow. */
-static inline unsigned long cr4_read_shadow(void)
-{
-	return this_cpu_read(cpu_tlbstate.cr4);
-}
-
-/*
- * Mark all other ASIDs as invalid, preserves the current.
- */
-static inline void invalidate_other_asid(void)
-{
-	this_cpu_write(cpu_tlbstate.invalidate_other, true);
-}
-
 /*
  * Save some of cr4 feature set we're using (e.g. Pentium 4MB
  * enable and PPro Global page enable), so that any CPU's that boot
  * up after us can get the correct flags. This should only be used
  * during boot on the boot cpu.
  */
 extern unsigned long mmu_cr4_features;
 extern u32 *trampoline_cr4_features;
 
-static inline void cr4_set_bits_and_update_boot(unsigned long mask)
-{
-	mmu_cr4_features |= mask;
-	if (trampoline_cr4_features)
-		*trampoline_cr4_features = mmu_cr4_features;
-	cr4_set_bits(mask);
-}
-
 extern void initialize_tlbstate_and_flush(void);
 
-/*
- * Given an ASID, flush the corresponding user ASID. We can delay this
- * until the next time we switch to it.
- *
- * See SWITCH_TO_USER_CR3.
- */
-static inline void invalidate_user_asid(u16 asid)
-{
-	/* There is no user ASID if address space separation is off */
-	if (!IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
-		return;
-
-	/*
-	 * We only have a single ASID if PCID is off and the CR3
-	 * write will have flushed it.
-	 */
-	if (!cpu_feature_enabled(X86_FEATURE_PCID))
-		return;
-
-	if (!static_cpu_has(X86_FEATURE_PTI))
-		return;
-
-	__set_bit(kern_pcid(asid),
-		  (unsigned long *)this_cpu_ptr(&cpu_tlbstate.user_pcid_flush_mask));
-}
-
-/*
- * flush the entire current user mapping
- */
-static inline void __native_flush_tlb(void)
-{
-	/*
-	 * Preemption or interrupts must be disabled to protect the access
-	 * to the per CPU variable and to prevent being preempted between
-	 * read_cr3() and write_cr3().
-	 */
-	WARN_ON_ONCE(preemptible());
-
-	invalidate_user_asid(this_cpu_read(cpu_tlbstate.loaded_mm_asid));
-
-	/* If current->mm == NULL then the read_cr3() "borrows" an mm */
-	native_write_cr3(__native_read_cr3());
-}
-
-/*
- * flush everything
- */
-static inline void __native_flush_tlb_global(void)
-{
-	unsigned long cr4, flags;
-
-	if (static_cpu_has(X86_FEATURE_INVPCID)) {
-		/*
-		 * Using INVPCID is considerably faster than a pair of writes
-		 * to CR4 sandwiched inside an IRQ flag save/restore.
-		 *
-		 * Note, this works with CR4.PCIDE=0 or 1.
-		 */
-		invpcid_flush_all();
-		return;
-	}
-
-	/*
-	 * Read-modify-write to CR4 - protect it from preemption and
-	 * from interrupts. (Use the raw variant because this code can
-	 * be called from deep inside debugging code.)
-	 */
-	raw_local_irq_save(flags);
-
-	cr4 = this_cpu_read(cpu_tlbstate.cr4);
-	/* toggle PGE */
-	native_write_cr4(cr4 ^ X86_CR4_PGE);
-	/* write old PGE again and flush TLBs */
-	native_write_cr4(cr4);
-
-	raw_local_irq_restore(flags);
-}
-
-/*
- * flush one page in the user mapping
- */
-static inline void __native_flush_tlb_one_user(unsigned long addr)
-{
-	u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
-
-	asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
-
-	if (!static_cpu_has(X86_FEATURE_PTI))
-		return;
-
-	/*
-	 * Some platforms #GP if we call invpcid(type=1/2) before CR4.PCIDE=1.
-	 * Just use invalidate_user_asid() in case we are called early.
-	 */
-	if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE))
-		invalidate_user_asid(loaded_mm_asid);
-	else
-		invpcid_flush_one(user_pcid(loaded_mm_asid), addr);
-}
-
-/*
- * flush everything
- */
-static inline void __flush_tlb_all(void)
-{
-	/*
-	 * This is to catch users with enabled preemption and the PGE feature
-	 * and don't trigger the warning in __native_flush_tlb().
-	 */
-	VM_WARN_ON_ONCE(preemptible());
-
-	if (boot_cpu_has(X86_FEATURE_PGE)) {
-		__flush_tlb_global();
-	} else {
-		/*
-		 * !PGE -> !PCID (setup_pcid()), thus every flush is total.
-		 */
-		__flush_tlb();
-	}
-}
-
-/*
- * flush one page in the kernel mapping
- */
-static inline void __flush_tlb_one_kernel(unsigned long addr)
-{
-	count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
-
-	/*
-	 * If PTI is off, then __flush_tlb_one_user() is just INVLPG or its
-	 * paravirt equivalent. Even with PCID, this is sufficient: we only
-	 * use PCID if we also use global PTEs for the kernel mapping, and
-	 * INVLPG flushes global translations across all address spaces.
-	 *
-	 * If PTI is on, then the kernel is mapped with non-global PTEs, and
-	 * __flush_tlb_one_user() will flush the given address for the current
-	 * kernel address space and for its usermode counterpart, but it does
-	 * not flush it for other address spaces.
-	 */
-	__flush_tlb_one_user(addr);
-
-	if (!static_cpu_has(X86_FEATURE_PTI))
-		return;
-
-	/*
-	 * See above. We need to propagate the flush to all other address
-	 * spaces. In principle, we only need to propagate it to kernelmode
-	 * address spaces, but the extra bookkeeping we would need is not
-	 * worth it.
-	 */
-	invalidate_other_asid();
-}
-
-#define TLB_FLUSH_ALL	-1UL
-
 /*
  * TLB flushing:
  *
@@ -563,7 +205,15 @@ struct flush_tlb_info {
 	bool			freed_tables;
 };
 
-#define local_flush_tlb() __flush_tlb()
+void flush_tlb_local(void);
+void flush_tlb_one_user(unsigned long addr);
+void flush_tlb_one_kernel(unsigned long addr);
+void flush_tlb_others(const struct cpumask *cpumask,
+		      const struct flush_tlb_info *info);
+
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#endif
 
 #define flush_tlb_mm(mm)	\
 		flush_tlb_mm_range(mm, 0UL, TLB_FLUSH_ALL, 0UL, true)
@@ -585,9 +235,6 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a)
 	flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, PAGE_SHIFT, false);
 }
 
-void native_flush_tlb_others(const struct cpumask *cpumask,
-			     const struct flush_tlb_info *info);
-
 static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
 {
 	/*
@@ -608,12 +255,6 @@ static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch,
 
 extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch);
 
-#ifndef CONFIG_PARAVIRT
-#define flush_tlb_others(mask, info)	\
-	native_flush_tlb_others(mask, info)
-
-#define paravirt_tlb_remove_table(tlb, page) \
-	tlb_remove_page(tlb, (void *)(page))
-#endif
+#endif /* !MODULE */
 
 #endif /* _ASM_X86_TLBFLUSH_H */
arch/x86/include/asm/uv/uv.h
@@ -8,6 +8,7 @@ enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC};
 
 struct cpumask;
 struct mm_struct;
+struct flush_tlb_info;
 
 #ifdef CONFIG_X86_UV
 #include <linux/efi.h>
arch/x86/kernel/alternative.c
@@ -783,6 +783,61 @@ void __init_or_module text_poke_early(void *addr, const void *opcode,
 	}
 }
 
+typedef struct {
+	struct mm_struct *mm;
+} temp_mm_state_t;
+
+/*
+ * Using a temporary mm allows to set temporary mappings that are not accessible
+ * by other CPUs. Such mappings are needed to perform sensitive memory writes
+ * that override the kernel memory protections (e.g., W^X), without exposing the
+ * temporary page-table mappings that are required for these write operations to
+ * other CPUs. Using a temporary mm also allows to avoid TLB shootdowns when the
+ * mapping is torn down.
+ *
+ * Context: The temporary mm needs to be used exclusively by a single core. To
+ *          harden security IRQs must be disabled while the temporary mm is
+ *          loaded, thereby preventing interrupt handler bugs from overriding
+ *          the kernel memory protection.
+ */
+static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm)
+{
+	temp_mm_state_t temp_state;
+
+	lockdep_assert_irqs_disabled();
+	temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm);
+	switch_mm_irqs_off(NULL, mm, current);
+
+	/*
+	 * If breakpoints are enabled, disable them while the temporary mm is
+	 * used. Userspace might set up watchpoints on addresses that are used
+	 * in the temporary mm, which would lead to wrong signals being sent or
+	 * crashes.
+	 *
+	 * Note that breakpoints are not disabled selectively, which also causes
+	 * kernel breakpoints (e.g., perf's) to be disabled. This might be
+	 * undesirable, but still seems reasonable as the code that runs in the
+	 * temporary mm should be short.
+	 */
+	if (hw_breakpoint_active())
+		hw_breakpoint_disable();
+
+	return temp_state;
+}
+
+static inline void unuse_temporary_mm(temp_mm_state_t prev_state)
+{
+	lockdep_assert_irqs_disabled();
+	switch_mm_irqs_off(NULL, prev_state.mm, current);
+
+	/*
+	 * Restore the breakpoints if they were disabled before the temporary mm
+	 * was loaded.
+	 */
+	if (hw_breakpoint_active())
+		hw_breakpoint_restore();
+}
+
 __ro_after_init struct mm_struct *poking_mm;
 __ro_after_init unsigned long poking_addr;
 
arch/x86/kernel/cpu/common.c
@@ -387,7 +387,30 @@ set_register:
 			  bits_missing);
 	}
 }
-EXPORT_SYMBOL(native_write_cr4);
+#if IS_MODULE(CONFIG_LKDTM)
+EXPORT_SYMBOL_GPL(native_write_cr4);
+#endif
+
+void cr4_update_irqsoff(unsigned long set, unsigned long clear)
+{
+	unsigned long newval, cr4 = this_cpu_read(cpu_tlbstate.cr4);
+
+	lockdep_assert_irqs_disabled();
+
+	newval = (cr4 & ~clear) | set;
+	if (newval != cr4) {
+		this_cpu_write(cpu_tlbstate.cr4, newval);
+		__write_cr4(newval);
+	}
+}
+EXPORT_SYMBOL(cr4_update_irqsoff);
+
+/* Read the CR4 shadow. */
+unsigned long cr4_read_shadow(void)
+{
+	return this_cpu_read(cpu_tlbstate.cr4);
+}
+EXPORT_SYMBOL_GPL(cr4_read_shadow);
 
 void cr4_init(void)
 {
arch/x86/kernel/cpu/mtrr/generic.c
@@ -761,7 +761,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
 
 	/* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */
 	count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
-	__flush_tlb();
+	flush_tlb_local();
 
 	/* Save MTRR state */
 	rdmsr(MSR_MTRRdefType, deftype_lo, deftype_hi);
@@ -778,7 +778,7 @@ static void post_set(void) __releases(set_atomicity_lock)
 {
 	/* Flush TLBs (no need to flush caches - they are disabled) */
 	count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
-	__flush_tlb();
+	flush_tlb_local();
 
 	/* Intel (P6) standard MTRRs */
 	mtrr_wrmsr(MSR_MTRRdefType, deftype_lo, deftype_hi);
arch/x86/kernel/paravirt.c
@@ -160,25 +160,6 @@ unsigned paravirt_patch_insns(void *insn_buff, unsigned len,
 	return insn_len;
 }
 
-static void native_flush_tlb(void)
-{
-	__native_flush_tlb();
-}
-
-/*
- * Global pages have to be flushed a bit differently. Not a real
- * performance problem because this does not happen often.
- */
-static void native_flush_tlb_global(void)
-{
-	__native_flush_tlb_global();
-}
-
-static void native_flush_tlb_one_user(unsigned long addr)
-{
-	__native_flush_tlb_one_user(addr);
-}
-
 struct static_key paravirt_steal_enabled;
 struct static_key paravirt_steal_rq_enabled;
 
@@ -359,7 +340,7 @@ struct paravirt_patch_template pv_ops = {
 #endif /* CONFIG_PARAVIRT_XXL */
 
 	/* Mmu ops. */
-	.mmu.flush_tlb_user	= native_flush_tlb,
+	.mmu.flush_tlb_user	= native_flush_tlb_local,
 	.mmu.flush_tlb_kernel	= native_flush_tlb_global,
 	.mmu.flush_tlb_one_user	= native_flush_tlb_one_user,
 	.mmu.flush_tlb_others	= native_flush_tlb_others,
arch/x86/kernel/process.c
@@ -612,6 +612,17 @@ void speculation_ctrl_update_current(void)
 	preempt_enable();
 }
 
+static inline void cr4_toggle_bits_irqsoff(unsigned long mask)
+{
+	unsigned long newval, cr4 = this_cpu_read(cpu_tlbstate.cr4);
+
+	newval = cr4 ^ mask;
+	if (newval != cr4) {
+		this_cpu_write(cpu_tlbstate.cr4, newval);
+		__write_cr4(newval);
+	}
+}
+
 void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p)
 {
 	unsigned long tifp, tifn;
arch/x86/mm/init.c
@@ -49,7 +49,7 @@
  * Index into __pte2cachemode_tbl[] are the caching attribute bits of the pte
  * (_PAGE_PWT, _PAGE_PCD, _PAGE_PAT) at index bit positions 0, 1, 2.
  */
-uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = {
+static uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = {
 	[_PAGE_CACHE_MODE_WB      ]	= 0         | 0        ,
 	[_PAGE_CACHE_MODE_WC      ]	= 0         | _PAGE_PCD,
 	[_PAGE_CACHE_MODE_UC_MINUS]	= 0         | _PAGE_PCD,
@@ -57,9 +57,16 @@ uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = {
 	[_PAGE_CACHE_MODE_WT      ]	= 0         | _PAGE_PCD,
 	[_PAGE_CACHE_MODE_WP      ]	= 0         | _PAGE_PCD,
 };
-EXPORT_SYMBOL(__cachemode2pte_tbl);
 
-uint8_t __pte2cachemode_tbl[8] = {
+unsigned long cachemode2protval(enum page_cache_mode pcm)
+{
+	if (likely(pcm == 0))
+		return 0;
+	return __cachemode2pte_tbl[pcm];
+}
+EXPORT_SYMBOL(cachemode2protval);
+
+static uint8_t __pte2cachemode_tbl[8] = {
 	[__pte2cm_idx( 0        | 0         | 0        )] = _PAGE_CACHE_MODE_WB,
 	[__pte2cm_idx(_PAGE_PWT | 0         | 0        )] = _PAGE_CACHE_MODE_UC_MINUS,
 	[__pte2cm_idx( 0        | _PAGE_PCD | 0        )] = _PAGE_CACHE_MODE_UC_MINUS,
@@ -69,7 +76,22 @@ uint8_t __pte2cachemode_tbl[8] = {
 	[__pte2cm_idx(0         | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS,
 	[__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC,
 };
-EXPORT_SYMBOL(__pte2cachemode_tbl);
+
+/* Check that the write-protect PAT entry is set for write-protect */
+bool x86_has_pat_wp(void)
+{
+	return __pte2cachemode_tbl[_PAGE_CACHE_MODE_WP] == _PAGE_CACHE_MODE_WP;
+}
+
+enum page_cache_mode pgprot2cachemode(pgprot_t pgprot)
+{
+	unsigned long masked;
+
+	masked = pgprot_val(pgprot) & _PAGE_CACHE_MASK;
+	if (likely(masked == 0))
+		return 0;
+	return __pte2cachemode_tbl[__pte2cm_idx(masked)];
+}
 
 static unsigned long __initdata pgt_buf_start;
 static unsigned long __initdata pgt_buf_end;
@@ -170,6 +192,19 @@ struct map_range {
 
 static int page_size_mask;
 
+/*
+ * Save some of cr4 feature set we're using (e.g. Pentium 4MB
+ * enable and PPro Global page enable), so that any CPU's that boot
+ * up after us can get the correct flags. Invoked on the boot CPU.
+ */
+static inline void cr4_set_bits_and_update_boot(unsigned long mask)
+{
+	mmu_cr4_features |= mask;
+	if (trampoline_cr4_features)
+		*trampoline_cr4_features = mmu_cr4_features;
+	cr4_set_bits(mask);
+}
+
 static void __init probe_page_size_mask(void)
 {
 	/*
@@ -955,7 +990,6 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
 	.next_asid = 1,
 	.cr4 = ~0UL,	/* fail hard if we screw up cr4 shadow initialization */
 };
-EXPORT_PER_CPU_SYMBOL(cpu_tlbstate);
 
 void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache)
 {
arch/x86/mm/init_64.c
@@ -304,7 +304,7 @@ static void __set_pte_vaddr(pud_t *pud, unsigned long vaddr, pte_t new_pte)
 	 * It's enough to flush this one mapping.
 	 * (PGE mappings get flushed as well)
 	 */
-	__flush_tlb_one_kernel(vaddr);
+	flush_tlb_one_kernel(vaddr);
 }
 
 void set_pte_vaddr_p4d(p4d_t *p4d_page, unsigned long vaddr, pte_t new_pte)
@@ -373,7 +373,7 @@ static void __init __init_extra_mapping(unsigned long phys, unsigned long size,
 	pgprot_t prot;
 
 	pgprot_val(prot) = pgprot_val(PAGE_KERNEL_LARGE) |
-		pgprot_val(pgprot_4k_2_large(cachemode2pgprot(cache)));
+		protval_4k_2_large(cachemode2protval(cache));
 	BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK));
 	for (; size; phys += PMD_SIZE, size -= PMD_SIZE) {
 		pgd = pgd_offset_k((unsigned long)__va(phys));
arch/x86/mm/ioremap.c
@@ -778,10 +778,8 @@ void __init *early_memremap_encrypted(resource_size_t phys_addr,
 void __init *early_memremap_encrypted_wp(resource_size_t phys_addr,
 					 unsigned long size)
 {
-	/* Be sure the write-protect PAT entry is set for write-protect */
-	if (__pte2cachemode_tbl[_PAGE_CACHE_MODE_WP] != _PAGE_CACHE_MODE_WP)
+	if (!x86_has_pat_wp())
 		return NULL;
 
 	return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_ENC_WP);
 }
@@ -799,10 +797,8 @@ void __init *early_memremap_decrypted(resource_size_t phys_addr,
 void __init *early_memremap_decrypted_wp(resource_size_t phys_addr,
 					 unsigned long size)
 {
-	/* Be sure the write-protect PAT entry is set for write-protect */
-	if (__pte2cachemode_tbl[_PAGE_CACHE_MODE_WP] != _PAGE_CACHE_MODE_WP)
+	if (!x86_has_pat_wp())
 		return NULL;
 
 	return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_NOENC_WP);
 }
 #endif	/* CONFIG_AMD_MEM_ENCRYPT */
@@ -889,5 +885,5 @@ void __init __early_set_fixmap(enum fixed_addresses idx,
 		set_pte(pte, pfn_pte(phys >> PAGE_SHIFT, flags));
 	else
 		pte_clear(&init_mm, addr, pte);
-	__flush_tlb_one_kernel(addr);
+	flush_tlb_one_kernel(addr);
 }
arch/x86/mm/kmmio.c
@@ -173,7 +173,7 @@ static int clear_page_presence(struct kmmio_fault_page *f, bool clear)
 		return -1;
 	}
 
-	__flush_tlb_one_kernel(f->addr);
+	flush_tlb_one_kernel(f->addr);
 	return 0;
 }
 
arch/x86/mm/mem_encrypt.c
@@ -134,7 +134,7 @@ static void __init __sme_early_map_unmap_mem(void *vaddr, unsigned long size,
 		size = (size <= PMD_SIZE) ? 0 : size - PMD_SIZE;
 	} while (size);
 
-	__native_flush_tlb();
+	flush_tlb_local();
 }
 
 void __init sme_unmap_bootdata(char *real_mode_data)
arch/x86/mm/pat/set_memory.c
@@ -69,6 +69,11 @@ static DEFINE_SPINLOCK(cpa_lock);
 #define CPA_PAGES_ARRAY 4
 #define CPA_NO_CHECK_ALIAS 8 /* Do not search for aliases */
 
+static inline pgprot_t cachemode2pgprot(enum page_cache_mode pcm)
+{
+	return __pgprot(cachemode2protval(pcm));
+}
+
 #ifdef CONFIG_PROC_FS
 static unsigned long direct_pages_count[PG_LEVEL_NUM];
 
@@ -341,7 +346,7 @@ static void __cpa_flush_tlb(void *data)
 	unsigned int i;
 
 	for (i = 0; i < cpa->numpages; i++)
-		__flush_tlb_one_kernel(fix_addr(__cpa_addr(cpa, i)));
+		flush_tlb_one_kernel(fix_addr(__cpa_addr(cpa, i)));
 }
 
 static void cpa_flush(struct cpa_data *data, int cache)
arch/x86/mm/pgtable.c
@@ -19,6 +19,14 @@ EXPORT_SYMBOL(physical_mask);
 #define PGTABLE_HIGHMEM 0
 #endif
 
+#ifndef CONFIG_PARAVIRT
+static inline
+void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table)
+{
+	tlb_remove_page(tlb, table);
+}
+#endif
+
 gfp_t __userpte_alloc_gfp = GFP_PGTABLE_USER | PGTABLE_HIGHMEM;
 
 pgtable_t pte_alloc_one(struct mm_struct *mm)
@@ -706,11 +714,9 @@ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
 	if (pud_present(*pud) && !pud_huge(*pud))
 		return 0;
 
-	prot = pgprot_4k_2_large(prot);
-
 	set_pte((pte_t *)pud, pfn_pte(
 		(u64)addr >> PAGE_SHIFT,
-		__pgprot(pgprot_val(prot) | _PAGE_PSE)));
+		__pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE)));
 
 	return 1;
 }
@@ -738,11 +744,9 @@ int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
 	if (pmd_present(*pmd) && !pmd_huge(*pmd))
 		return 0;
 
-	prot = pgprot_4k_2_large(prot);
-
 	set_pte((pte_t *)pmd, pfn_pte(
 		(u64)addr >> PAGE_SHIFT,
-		__pgprot(pgprot_val(prot) | _PAGE_PSE)));
+		__pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE)));
 
 	return 1;
 }
arch/x86/mm/pgtable_32.c
@@ -64,7 +64,7 @@ void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
 	 * It's enough to flush this one mapping.
 	 * (PGE mappings get flushed as well)
 	 */
-	__flush_tlb_one_kernel(vaddr);
+	flush_tlb_one_kernel(vaddr);
 }
 
 unsigned long __FIXADDR_TOP = 0xfffff000;
arch/x86/mm/tlb.c
@@ -18,6 +18,16 @@
 
 #include "mm_internal.h"
 
+#ifdef CONFIG_PARAVIRT
+# define STATIC_NOPV
+#else
+# define STATIC_NOPV			static
+# define __flush_tlb_local		native_flush_tlb_local
+# define __flush_tlb_global		native_flush_tlb_global
+# define __flush_tlb_one_user(addr)	native_flush_tlb_one_user(addr)
+# define __flush_tlb_others(msk, info)	native_flush_tlb_others(msk, info)
+#endif
+
 /*
  *	TLB flushing, formerly SMP-only
  *		c/o Linus Torvalds.
@@ -38,6 +48,126 @@
  */
 #define LAST_USER_MM_IBPB	0x1UL
 
+/*
+ * The x86 feature is called PCID (Process Context IDentifier). It is similar
+ * to what is traditionally called ASID on the RISC processors.
+ *
+ * We don't use the traditional ASID implementation, where each process/mm gets
+ * its own ASID and flush/restart when we run out of ASID space.
+ *
+ * Instead we have a small per-cpu array of ASIDs and cache the last few mm's
+ * that came by on this CPU, allowing cheaper switch_mm between processes on
+ * this CPU.
+ *
+ * We end up with different spaces for different things. To avoid confusion we
+ * use different names for each of them:
+ *
+ * ASID  - [0, TLB_NR_DYN_ASIDS-1]
+ *         the canonical identifier for an mm
+ *
+ * kPCID - [1, TLB_NR_DYN_ASIDS]
+ *         the value we write into the PCID part of CR3; corresponds to the
+ *         ASID+1, because PCID 0 is special.
+ *
+ * uPCID - [2048 + 1, 2048 + TLB_NR_DYN_ASIDS]
+ *         for KPTI each mm has two address spaces and thus needs two
+ *         PCID values, but we can still do with a single ASID denomination
+ *         for each mm. Corresponds to kPCID + 2048.
+ *
+ */
+
+/* There are 12 bits of space for ASIDS in CR3 */
+#define CR3_HW_ASID_BITS		12
+
+/*
+ * When enabled, PAGE_TABLE_ISOLATION consumes a single bit for
+ * user/kernel switches
+ */
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+# define PTI_CONSUMED_PCID_BITS	1
+#else
+# define PTI_CONSUMED_PCID_BITS	0
+#endif
+
+#define CR3_AVAIL_PCID_BITS (X86_CR3_PCID_BITS - PTI_CONSUMED_PCID_BITS)
+
+/*
+ * ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid. -1 below to account
+ * for them being zero-based. Another -1 is because PCID 0 is reserved for
+ * use by non-PCID-aware users.
+ */
+#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_PCID_BITS) - 2)
+
+/*
+ * Given @asid, compute kPCID
+ */
+static inline u16 kern_pcid(u16 asid)
+{
+	VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+	/*
+	 * Make sure that the dynamic ASID space does not confict with the
+	 * bit we are using to switch between user and kernel ASIDs.
+	 */
+	BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_PCID_USER_BIT));
+
+	/*
+	 * The ASID being passed in here should have respected the
+	 * MAX_ASID_AVAILABLE and thus never have the switch bit set.
+	 */
+	VM_WARN_ON_ONCE(asid & (1 << X86_CR3_PTI_PCID_USER_BIT));
+#endif
+	/*
+	 * The dynamically-assigned ASIDs that get passed in are small
+	 * (<TLB_NR_DYN_ASIDS). They never have the high switch bit set,
+	 * so do not bother to clear it.
+	 *
+	 * If PCID is on, ASID-aware code paths put the ASID+1 into the
+	 * PCID bits. This serves two purposes. It prevents a nasty
+	 * situation in which PCID-unaware code saves CR3, loads some other
+	 * value (with PCID == 0), and then restores CR3, thus corrupting
+	 * the TLB for ASID 0 if the saved ASID was nonzero. It also means
+	 * that any bugs involving loading a PCID-enabled CR3 with
+	 * CR4.PCIDE off will trigger deterministically.
+	 */
+	return asid + 1;
+}
+
+/*
+ * Given @asid, compute uPCID
+ */
+static inline u16 user_pcid(u16 asid)
+{
+	u16 ret = kern_pcid(asid);
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+	ret |= 1 << X86_CR3_PTI_PCID_USER_BIT;
+#endif
+	return ret;
+}
+
+static inline unsigned long build_cr3(pgd_t *pgd, u16 asid)
+{
+	if (static_cpu_has(X86_FEATURE_PCID)) {
+		return __sme_pa(pgd) | kern_pcid(asid);
+	} else {
+		VM_WARN_ON_ONCE(asid != 0);
+		return __sme_pa(pgd);
+	}
+}
+
+static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid)
+{
+	VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
+	/*
+	 * Use boot_cpu_has() instead of this_cpu_has() as this function
+	 * might be called during early boot. This should work even after
+	 * boot because all CPU's the have same capabilities:
+	 */
+	VM_WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_PCID));
+	return __sme_pa(pgd) | kern_pcid(asid) | CR3_NOFLUSH;
+}
+
 /*
  * We get here when we do something requiring a TLB invalidation
  * but could not go invalidate all of the contexts.  We do the
@@ -110,6 +240,32 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
 	*need_flush = true;
 }
 
+/*
+ * Given an ASID, flush the corresponding user ASID. We can delay this
+ * until the next time we switch to it.
+ *
+ * See SWITCH_TO_USER_CR3.
+ */
+static inline void invalidate_user_asid(u16 asid)
+{
+	/* There is no user ASID if address space separation is off */
+	if (!IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
+		return;
+
+	/*
+	 * We only have a single ASID if PCID is off and the CR3
+	 * write will have flushed it.
+	 */
+	if (!cpu_feature_enabled(X86_FEATURE_PCID))
+		return;
+
+	if (!static_cpu_has(X86_FEATURE_PTI))
+		return;
+
+	__set_bit(kern_pcid(asid),
+		  (unsigned long *)this_cpu_ptr(&cpu_tlbstate.user_pcid_flush_mask));
+}
+
 static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush)
 {
 	unsigned long new_mm_cr3;
@@ -244,6 +400,26 @@ static void cond_ibpb(struct task_struct *next)
 	}
 }
 
+#ifdef CONFIG_PERF_EVENTS
+static inline void cr4_update_pce_mm(struct mm_struct *mm)
+{
+	if (static_branch_unlikely(&rdpmc_always_available_key) ||
+	    (!static_branch_unlikely(&rdpmc_never_available_key) &&
+	     atomic_read(&mm->context.perf_rdpmc_allowed)))
+		cr4_set_bits_irqsoff(X86_CR4_PCE);
+	else
+		cr4_clear_bits_irqsoff(X86_CR4_PCE);
+}
+
+void cr4_update_pce(void *ignored)
+{
+	cr4_update_pce_mm(this_cpu_read(cpu_tlbstate.loaded_mm));
+}
+
+#else
+static inline void cr4_update_pce_mm(struct mm_struct *mm) { }
+#endif
+
 void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 			struct task_struct *tsk)
 {
@@ -403,7 +579,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 	this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
 
 	if (next != real_prev) {
-		load_mm_cr4_irqsoff(next);
+		cr4_update_pce_mm(next);
 		switch_ldt(real_prev, next);
 	}
 }
@@ -580,7 +756,7 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
 		unsigned long addr = f->start;
 
 		while (addr < f->end) {
-			__flush_tlb_one_user(addr);
+			flush_tlb_one_user(addr);
 			addr += 1UL << f->stride_shift;
 		}
 		if (local)
@@ -588,7 +764,7 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
 		trace_tlb_flush(reason, nr_invalidate);
 	} else {
 		/* Full flush. */
-		local_flush_tlb();
+		flush_tlb_local();
 		if (local)
 			count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
 		trace_tlb_flush(reason, TLB_FLUSH_ALL);
@@ -623,8 +799,8 @@ static bool tlb_is_not_lazy(int cpu, void *data)
 	return !per_cpu(cpu_tlbstate.is_lazy, cpu);
 }
 
-void native_flush_tlb_others(const struct cpumask *cpumask,
-			     const struct flush_tlb_info *info)
+STATIC_NOPV void native_flush_tlb_others(const struct cpumask *cpumask,
+					 const struct flush_tlb_info *info)
 {
 	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
 	if (info->end == TLB_FLUSH_ALL)
@@ -674,6 +850,12 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
 			       (void *)info, 1, cpumask);
 }
 
+void flush_tlb_others(const struct cpumask *cpumask,
+		      const struct flush_tlb_info *info)
+{
+	__flush_tlb_others(cpumask, info);
+}
+
 /*
  * See Documentation/x86/tlb.rst for details.  We choose 33
  * because it is large enough to cover the vast majority (at
@@ -784,7 +966,7 @@ static void do_kernel_range_flush(void *info)
 
 	/* flush range by one by one 'invlpg' */
 	for (addr = f->start; addr < f->end; addr += PAGE_SIZE)
-		__flush_tlb_one_kernel(addr);
+		flush_tlb_one_kernel(addr);
 }
 
 void flush_tlb_kernel_range(unsigned long start, unsigned long end)
@@ -806,6 +988,164 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end)
 	}
 }
 
+/*
+ * This can be used from process context to figure out what the value of
+ * CR3 is without needing to do a (slow) __read_cr3().
+ *
+ * It's intended to be used for code like KVM that sneakily changes CR3
+ * and needs to restore it. It needs to be used very carefully.
+ */
+unsigned long __get_current_cr3_fast(void)
+{
+	unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd,
+		this_cpu_read(cpu_tlbstate.loaded_mm_asid));
+
+	/* For now, be very restrictive about when this can be called. */
+	VM_WARN_ON(in_nmi() || preemptible());
+
+	VM_BUG_ON(cr3 != __read_cr3());
+	return cr3;
+}
+EXPORT_SYMBOL_GPL(__get_current_cr3_fast);
+
+/*
+ * Flush one page in the kernel mapping
+ */
+void flush_tlb_one_kernel(unsigned long addr)
+{
+	count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
+
+	/*
+	 * If PTI is off, then __flush_tlb_one_user() is just INVLPG or its
+	 * paravirt equivalent. Even with PCID, this is sufficient: we only
+	 * use PCID if we also use global PTEs for the kernel mapping, and
+	 * INVLPG flushes global translations across all address spaces.
+	 *
+	 * If PTI is on, then the kernel is mapped with non-global PTEs, and
+	 * __flush_tlb_one_user() will flush the given address for the current
+	 * kernel address space and for its usermode counterpart, but it does
+	 * not flush it for other address spaces.
+	 */
+	flush_tlb_one_user(addr);
+
+	if (!static_cpu_has(X86_FEATURE_PTI))
+		return;
+
+	/*
+	 * See above. We need to propagate the flush to all other address
+	 * spaces. In principle, we only need to propagate it to kernelmode
+	 * address spaces, but the extra bookkeeping we would need is not
+	 * worth it.
+	 */
+	this_cpu_write(cpu_tlbstate.invalidate_other, true);
+}
+
+/*
+ * Flush one page in the user mapping
+ */
+STATIC_NOPV void native_flush_tlb_one_user(unsigned long addr)
+{
+	u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+
+	asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
+
+	if (!static_cpu_has(X86_FEATURE_PTI))
+		return;
+
+	/*
+	 * Some platforms #GP if we call invpcid(type=1/2) before CR4.PCIDE=1.
+	 * Just use invalidate_user_asid() in case we are called early.
+	 */
+	if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE))
+		invalidate_user_asid(loaded_mm_asid);
+	else
+		invpcid_flush_one(user_pcid(loaded_mm_asid), addr);
+}
+
+void flush_tlb_one_user(unsigned long addr)
+{
+	__flush_tlb_one_user(addr);
+}
+
+/*
+ * Flush everything
+ */
+STATIC_NOPV void native_flush_tlb_global(void)
+{
+	unsigned long cr4, flags;
+
+	if (static_cpu_has(X86_FEATURE_INVPCID)) {
+		/*
+		 * Using INVPCID is considerably faster than a pair of writes
+		 * to CR4 sandwiched inside an IRQ flag save/restore.
+		 *
+		 * Note, this works with CR4.PCIDE=0 or 1.
+		 */
+		invpcid_flush_all();
+		return;
+	}
+
+	/*
+	 * Read-modify-write to CR4 - protect it from preemption and
+	 * from interrupts. (Use the raw variant because this code can
+	 * be called from deep inside debugging code.)
+	 */
+	raw_local_irq_save(flags);
+
+	cr4 = this_cpu_read(cpu_tlbstate.cr4);
+	/* toggle PGE */
+	native_write_cr4(cr4 ^ X86_CR4_PGE);
+	/* write old PGE again and flush TLBs */
+	native_write_cr4(cr4);
+
+	raw_local_irq_restore(flags);
+}
+
+/*
+ * Flush the entire current user mapping
+ */
+STATIC_NOPV void native_flush_tlb_local(void)
+{
+	/*
+	 * Preemption or interrupts must be disabled to protect the access
+	 * to the per CPU variable and to prevent being preempted between
+	 * read_cr3() and write_cr3().
+	 */
+	WARN_ON_ONCE(preemptible());
+
+	invalidate_user_asid(this_cpu_read(cpu_tlbstate.loaded_mm_asid));
+
+	/* If current->mm == NULL then the read_cr3() "borrows" an mm */
+	native_write_cr3(__native_read_cr3());
+}
+
+void flush_tlb_local(void)
+{
+	__flush_tlb_local();
+}
+
+/*
+ * Flush everything
+ */
+void __flush_tlb_all(void)
+{
+	/*
+	 * This is to catch users with enabled preemption and the PGE feature
+	 * and don't trigger the warning in __native_flush_tlb().
+	 */
+	VM_WARN_ON_ONCE(preemptible());
+
+	if (boot_cpu_has(X86_FEATURE_PGE)) {
+		__flush_tlb_global();
+	} else {
+		/*
+		 * !PGE -> !PCID (setup_pcid()), thus every flush is total.
+		 */
+		flush_tlb_local();
+	}
+}
+EXPORT_SYMBOL_GPL(__flush_tlb_all);
+
 /*
  * arch_tlbbatch_flush() performs a full TLB flush regardless of the active mm.
  * This means that the 'struct flush_tlb_info' that describes which mappings to
@@ -837,6 +1177,38 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
 	put_cpu();
 }
 
+/*
+ * Blindly accessing user memory from NMI context can be dangerous
+ * if we're in the middle of switching the current user task or
+ * switching the loaded mm. It can also be dangerous if we
+ * interrupted some kernel code that was temporarily using a
+ * different mm.
+ */
+bool nmi_uaccess_okay(void)
+{
+	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
+	struct mm_struct *current_mm = current->mm;
+
+	VM_WARN_ON_ONCE(!loaded_mm);
+
+	/*
+	 * The condition we want to check is
+	 * current_mm->pgd == __va(read_cr3_pa()). This may be slow, though,
+	 * if we're running in a VM with shadow paging, and nmi_uaccess_okay()
+	 * is supposed to be reasonably fast.
+	 *
+	 * Instead, we check the almost equivalent but somewhat conservative
+	 * condition below, and we rely on the fact that switch_mm_irqs_off()
+	 * sets loaded_mm to LOADED_MM_SWITCHING before writing to CR3.
+	 */
+	if (loaded_mm != current_mm)
+		return false;
+
+	VM_WARN_ON_ONCE(current_mm->pgd != __va(read_cr3_pa()));
+
+	return true;
+}
+
 static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf,
 				  size_t count, loff_t *ppos)
 {
|
|||
* This must be a normal message, or retry of a normal message
|
||||
*/
|
||||
if (msg->address == TLB_FLUSH_ALL) {
|
||||
local_flush_tlb();
|
||||
flush_tlb_local();
|
||||
stat->d_alltlb++;
|
||||
} else {
|
||||
__flush_tlb_one_user(msg->address);
|
||||
flush_tlb_one_user(msg->address);
|
||||
stat->d_onetlb++;
|
||||
}
|
||||
stat->d_requestee++;
|
||||
|
|
|
@ -27,7 +27,6 @@
|
|||
|
||||
#include <asm/pgalloc.h>
|
||||
#include <asm/pgtable.h>
|
||||
#include <asm/tlb.h>
|
||||
#include <asm/xen/hypervisor.h>
|
||||
#include <asm/xen/hypercall.h>
|
||||
|
||||
|
|