From a563f7598198b8389e00451ef6f3f1c12efbfb99 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Mon, 4 Apr 2016 11:43:15 +0100 Subject: [PATCH 01/29] arm64: Reuse TCR field definitions for EL1 and EL2 TCR_EL1, TCR_EL2 and VTCR_EL2, all share some field positions (TG0, ORGN0, IRGN0 and SH0) and their corresponding value definitions. This patch makes the TCR_EL1 definitions reusable and uses them for TCR_EL2 and VTCR_EL2 fields. This also fixes a bug where we assume TG0 in {V}TCR_EL2 is 1bit field. Cc: Catalin Marinas Cc: Mark Rutland Acked-by: Marc Zyngier Acked-by: Will Deacon Reviewed-by: Christoffer Dall Signed-off-by: Suzuki K Poulose --- arch/arm64/include/asm/kvm_arm.h | 48 ++++++++-------- arch/arm64/include/asm/pgtable-hwdef.h | 80 ++++++++++++++++++++------ 2 files changed, 88 insertions(+), 40 deletions(-) diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index 3f29887995bc..a46b39f62426 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -96,32 +96,34 @@ SCTLR_EL2_SA | SCTLR_EL2_I) /* TCR_EL2 Registers bits */ -#define TCR_EL2_RES1 ((1 << 31) | (1 << 23)) -#define TCR_EL2_TBI (1 << 20) -#define TCR_EL2_PS (7 << 16) -#define TCR_EL2_PS_40B (2 << 16) -#define TCR_EL2_TG0 (1 << 14) -#define TCR_EL2_SH0 (3 << 12) -#define TCR_EL2_ORGN0 (3 << 10) -#define TCR_EL2_IRGN0 (3 << 8) -#define TCR_EL2_T0SZ 0x3f -#define TCR_EL2_MASK (TCR_EL2_TG0 | TCR_EL2_SH0 | \ - TCR_EL2_ORGN0 | TCR_EL2_IRGN0 | TCR_EL2_T0SZ) +#define TCR_EL2_RES1 ((1 << 31) | (1 << 23)) +#define TCR_EL2_TBI (1 << 20) +#define TCR_EL2_PS_SHIFT 16 +#define TCR_EL2_PS_MASK (7 << TCR_EL2_PS_SHIFT) +#define TCR_EL2_PS_40B (2 << TCR_EL2_PS_SHIFT) +#define TCR_EL2_TG0_MASK TCR_TG0_MASK +#define TCR_EL2_SH0_MASK TCR_SH0_MASK +#define TCR_EL2_ORGN0_MASK TCR_ORGN0_MASK +#define TCR_EL2_IRGN0_MASK TCR_IRGN0_MASK +#define TCR_EL2_T0SZ_MASK 0x3f +#define TCR_EL2_MASK (TCR_EL2_TG0_MASK | TCR_EL2_SH0_MASK | \ + TCR_EL2_ORGN0_MASK | TCR_EL2_IRGN0_MASK | TCR_EL2_T0SZ_MASK) /* VTCR_EL2 Registers bits */ #define VTCR_EL2_RES1 (1 << 31) -#define VTCR_EL2_PS_MASK (7 << 16) -#define VTCR_EL2_TG0_MASK (1 << 14) -#define VTCR_EL2_TG0_4K (0 << 14) -#define VTCR_EL2_TG0_64K (1 << 14) -#define VTCR_EL2_SH0_MASK (3 << 12) -#define VTCR_EL2_SH0_INNER (3 << 12) -#define VTCR_EL2_ORGN0_MASK (3 << 10) -#define VTCR_EL2_ORGN0_WBWA (1 << 10) -#define VTCR_EL2_IRGN0_MASK (3 << 8) -#define VTCR_EL2_IRGN0_WBWA (1 << 8) -#define VTCR_EL2_SL0_MASK (3 << 6) -#define VTCR_EL2_SL0_LVL1 (1 << 6) +#define VTCR_EL2_PS_MASK TCR_EL2_PS_MASK +#define VTCR_EL2_TG0_MASK TCR_TG0_MASK +#define VTCR_EL2_TG0_4K TCR_TG0_4K +#define VTCR_EL2_TG0_64K TCR_TG0_64K +#define VTCR_EL2_SH0_MASK TCR_SH0_MASK +#define VTCR_EL2_SH0_INNER TCR_SH0_INNER +#define VTCR_EL2_ORGN0_MASK TCR_ORGN0_MASK +#define VTCR_EL2_ORGN0_WBWA TCR_ORGN0_WBWA +#define VTCR_EL2_IRGN0_MASK TCR_IRGN0_MASK +#define VTCR_EL2_IRGN0_WBWA TCR_IRGN0_WBWA +#define VTCR_EL2_SL0_SHIFT 6 +#define VTCR_EL2_SL0_MASK (3 << VTCR_EL2_SL0_SHIFT) +#define VTCR_EL2_SL0_LVL1 (1 << VTCR_EL2_SL0_SHIFT) #define VTCR_EL2_T0SZ_MASK 0x3f #define VTCR_EL2_T0SZ_40B 24 #define VTCR_EL2_VS_SHIFT 19 diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index 5c25b831273d..936f1732727c 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h @@ -208,23 +208,69 @@ #define TCR_T1SZ(x) ((UL(64) - (x)) << TCR_T1SZ_OFFSET) #define TCR_TxSZ(x) (TCR_T0SZ(x) | TCR_T1SZ(x)) #define TCR_TxSZ_WIDTH 6 -#define TCR_IRGN_NC ((UL(0) << 8) | (UL(0) << 24)) -#define TCR_IRGN_WBWA ((UL(1) << 8) | (UL(1) << 24)) -#define TCR_IRGN_WT ((UL(2) << 8) | (UL(2) << 24)) -#define TCR_IRGN_WBnWA ((UL(3) << 8) | (UL(3) << 24)) -#define TCR_IRGN_MASK ((UL(3) << 8) | (UL(3) << 24)) -#define TCR_ORGN_NC ((UL(0) << 10) | (UL(0) << 26)) -#define TCR_ORGN_WBWA ((UL(1) << 10) | (UL(1) << 26)) -#define TCR_ORGN_WT ((UL(2) << 10) | (UL(2) << 26)) -#define TCR_ORGN_WBnWA ((UL(3) << 10) | (UL(3) << 26)) -#define TCR_ORGN_MASK ((UL(3) << 10) | (UL(3) << 26)) -#define TCR_SHARED ((UL(3) << 12) | (UL(3) << 28)) -#define TCR_TG0_4K (UL(0) << 14) -#define TCR_TG0_64K (UL(1) << 14) -#define TCR_TG0_16K (UL(2) << 14) -#define TCR_TG1_16K (UL(1) << 30) -#define TCR_TG1_4K (UL(2) << 30) -#define TCR_TG1_64K (UL(3) << 30) + +#define TCR_IRGN0_SHIFT 8 +#define TCR_IRGN0_MASK (UL(3) << TCR_IRGN0_SHIFT) +#define TCR_IRGN0_NC (UL(0) << TCR_IRGN0_SHIFT) +#define TCR_IRGN0_WBWA (UL(1) << TCR_IRGN0_SHIFT) +#define TCR_IRGN0_WT (UL(2) << TCR_IRGN0_SHIFT) +#define TCR_IRGN0_WBnWA (UL(3) << TCR_IRGN0_SHIFT) + +#define TCR_IRGN1_SHIFT 24 +#define TCR_IRGN1_MASK (UL(3) << TCR_IRGN1_SHIFT) +#define TCR_IRGN1_NC (UL(0) << TCR_IRGN1_SHIFT) +#define TCR_IRGN1_WBWA (UL(1) << TCR_IRGN1_SHIFT) +#define TCR_IRGN1_WT (UL(2) << TCR_IRGN1_SHIFT) +#define TCR_IRGN1_WBnWA (UL(3) << TCR_IRGN1_SHIFT) + +#define TCR_IRGN_NC (TCR_IRGN0_NC | TCR_IRGN1_NC) +#define TCR_IRGN_WBWA (TCR_IRGN0_WBWA | TCR_IRGN1_WBWA) +#define TCR_IRGN_WT (TCR_IRGN0_WT | TCR_IRGN1_WT) +#define TCR_IRGN_WBnWA (TCR_IRGN0_WBnWA | TCR_IRGN1_WBnWA) +#define TCR_IRGN_MASK (TCR_IRGN0_MASK | TCR_IRGN1_MASK) + + +#define TCR_ORGN0_SHIFT 10 +#define TCR_ORGN0_MASK (UL(3) << TCR_ORGN0_SHIFT) +#define TCR_ORGN0_NC (UL(0) << TCR_ORGN0_SHIFT) +#define TCR_ORGN0_WBWA (UL(1) << TCR_ORGN0_SHIFT) +#define TCR_ORGN0_WT (UL(2) << TCR_ORGN0_SHIFT) +#define TCR_ORGN0_WBnWA (UL(3) << TCR_ORGN0_SHIFT) + +#define TCR_ORGN1_SHIFT 26 +#define TCR_ORGN1_MASK (UL(3) << TCR_ORGN1_SHIFT) +#define TCR_ORGN1_NC (UL(0) << TCR_ORGN1_SHIFT) +#define TCR_ORGN1_WBWA (UL(1) << TCR_ORGN1_SHIFT) +#define TCR_ORGN1_WT (UL(2) << TCR_ORGN1_SHIFT) +#define TCR_ORGN1_WBnWA (UL(3) << TCR_ORGN1_SHIFT) + +#define TCR_ORGN_NC (TCR_ORGN0_NC | TCR_ORGN1_NC) +#define TCR_ORGN_WBWA (TCR_ORGN0_WBWA | TCR_ORGN1_WBWA) +#define TCR_ORGN_WT (TCR_ORGN0_WT | TCR_ORGN1_WT) +#define TCR_ORGN_WBnWA (TCR_ORGN0_WBnWA | TCR_ORGN1_WBnWA) +#define TCR_ORGN_MASK (TCR_ORGN0_MASK | TCR_ORGN1_MASK) + +#define TCR_SH0_SHIFT 12 +#define TCR_SH0_MASK (UL(3) << TCR_SH0_SHIFT) +#define TCR_SH0_INNER (UL(3) << TCR_SH0_SHIFT) + +#define TCR_SH1_SHIFT 28 +#define TCR_SH1_MASK (UL(3) << TCR_SH1_SHIFT) +#define TCR_SH1_INNER (UL(3) << TCR_SH1_SHIFT) +#define TCR_SHARED (TCR_SH0_INNER | TCR_SH1_INNER) + +#define TCR_TG0_SHIFT 14 +#define TCR_TG0_MASK (UL(3) << TCR_TG0_SHIFT) +#define TCR_TG0_4K (UL(0) << TCR_TG0_SHIFT) +#define TCR_TG0_64K (UL(1) << TCR_TG0_SHIFT) +#define TCR_TG0_16K (UL(2) << TCR_TG0_SHIFT) + +#define TCR_TG1_SHIFT 30 +#define TCR_TG1_MASK (UL(3) << TCR_TG1_SHIFT) +#define TCR_TG1_16K (UL(1) << TCR_TG1_SHIFT) +#define TCR_TG1_4K (UL(2) << TCR_TG1_SHIFT) +#define TCR_TG1_64K (UL(3) << TCR_TG1_SHIFT) + #define TCR_ASID16 (UL(1) << 36) #define TCR_TBI0 (UL(1) << 37) #define TCR_HA (UL(1) << 39) From acd05010400215b281a9197a889bec3e67998654 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Mon, 4 Apr 2016 11:53:52 +0100 Subject: [PATCH 02/29] arm64: Cleanup VTCR_EL2 and VTTBR field values We share most of the bits for VTCR_EL2 for different page sizes, except for the TG0 value and the entry level value. This patch makes the definitions a bit more cleaner to reflect this fact. Also cleans up the VTTBR_X calculation. No functional changes. Cc: Marc Zyngier Acked-by: Marc Zyngier Reviewed-by: Christoffer Dall Signed-off-by: Suzuki K Poulose --- arch/arm64/include/asm/kvm_arm.h | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index a46b39f62426..1281d98392a0 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -144,30 +144,32 @@ * The magic numbers used for VTTBR_X in this patch can be found in Tables * D4-23 and D4-25 in ARM DDI 0487A.b. */ + +#define VTCR_EL2_T0SZ_IPA VTCR_EL2_T0SZ_40B +#define VTCR_EL2_COMMON_BITS (VTCR_EL2_SH0_INNER | VTCR_EL2_ORGN0_WBWA | \ + VTCR_EL2_IRGN0_WBWA | VTCR_EL2_RES1) + #ifdef CONFIG_ARM64_64K_PAGES /* * Stage2 translation configuration: - * 40bits input (T0SZ = 24) * 64kB pages (TG0 = 1) * 2 level page tables (SL = 1) */ -#define VTCR_EL2_FLAGS (VTCR_EL2_TG0_64K | VTCR_EL2_SH0_INNER | \ - VTCR_EL2_ORGN0_WBWA | VTCR_EL2_IRGN0_WBWA | \ - VTCR_EL2_SL0_LVL1 | VTCR_EL2_RES1) -#define VTTBR_X (38 - VTCR_EL2_T0SZ_40B) +#define VTCR_EL2_TGRAN_FLAGS (VTCR_EL2_TG0_64K | VTCR_EL2_SL0_LVL1) +#define VTTBR_X_TGRAN_MAGIC 38 #else /* * Stage2 translation configuration: - * 40bits input (T0SZ = 24) * 4kB pages (TG0 = 0) * 3 level page tables (SL = 1) */ -#define VTCR_EL2_FLAGS (VTCR_EL2_TG0_4K | VTCR_EL2_SH0_INNER | \ - VTCR_EL2_ORGN0_WBWA | VTCR_EL2_IRGN0_WBWA | \ - VTCR_EL2_SL0_LVL1 | VTCR_EL2_RES1) -#define VTTBR_X (37 - VTCR_EL2_T0SZ_40B) +#define VTCR_EL2_TGRAN_FLAGS (VTCR_EL2_TG0_4K | VTCR_EL2_SL0_LVL1) +#define VTTBR_X_TGRAN_MAGIC 37 #endif +#define VTCR_EL2_FLAGS (VTCR_EL2_COMMON_BITS | VTCR_EL2_TGRAN_FLAGS) +#define VTTBR_X (VTTBR_X_TGRAN_MAGIC - VTCR_EL2_T0SZ_IPA) + #define VTTBR_BADDR_SHIFT (VTTBR_X - 1) #define VTTBR_BADDR_MASK (((UL(1) << (PHYS_MASK_SHIFT - VTTBR_X)) - 1) << VTTBR_BADDR_SHIFT) #define VTTBR_VMID_SHIFT (UL(48)) From 120f0779c3ed89c25ef1db943feac8ed73a0d7f9 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Tue, 1 Mar 2016 10:03:06 +0000 Subject: [PATCH 03/29] kvm arm: Move fake PGD handling to arch specific files Rearrange the code for fake pgd handling, which is applicable only for arm64. This will later be removed once we introduce the stage2 page table walker macros. Reviewed-by: Marc Zyngier Reviewed-by: Christoffer Dall Signed-off-by: Suzuki K Poulose --- arch/arm/include/asm/kvm_mmu.h | 11 ++++++-- arch/arm/kvm/mmu.c | 47 +++++--------------------------- arch/arm64/include/asm/kvm_mmu.h | 43 +++++++++++++++++++++++++++++ 3 files changed, 59 insertions(+), 42 deletions(-) diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index da44be9db4fa..c2b2b27b7da1 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -161,8 +161,6 @@ static inline bool kvm_page_empty(void *ptr) #define kvm_pmd_table_empty(kvm, pmdp) kvm_page_empty(pmdp) #define kvm_pud_table_empty(kvm, pudp) (0) -#define KVM_PREALLOC_LEVEL 0 - static inline void *kvm_get_hwpgd(struct kvm *kvm) { return kvm->arch.pgd; @@ -173,6 +171,15 @@ static inline unsigned int kvm_get_hwpgd_size(void) return PTRS_PER_S2_PGD * sizeof(pgd_t); } +static inline pgd_t *kvm_setup_fake_pgd(pgd_t *hwpgd) +{ + return hwpgd; +} + +static inline void kvm_free_fake_pgd(pgd_t *pgd) +{ +} + struct kvm; #define kvm_flush_dcache_to_poc(a,l) __cpuc_flush_dcache_area((a), (l)) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 58dbd5c439df..774d00b8066b 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -684,47 +684,16 @@ int kvm_alloc_stage2_pgd(struct kvm *kvm) if (!hwpgd) return -ENOMEM; - /* When the kernel uses more levels of page tables than the + /* + * When the kernel uses more levels of page tables than the * guest, we allocate a fake PGD and pre-populate it to point * to the next-level page table, which will be the real * initial page table pointed to by the VTTBR. - * - * When KVM_PREALLOC_LEVEL==2, we allocate a single page for - * the PMD and the kernel will use folded pud. - * When KVM_PREALLOC_LEVEL==1, we allocate 2 consecutive PUD - * pages. */ - if (KVM_PREALLOC_LEVEL > 0) { - int i; - - /* - * Allocate fake pgd for the page table manipulation macros to - * work. This is not used by the hardware and we have no - * alignment requirement for this allocation. - */ - pgd = kmalloc(PTRS_PER_S2_PGD * sizeof(pgd_t), - GFP_KERNEL | __GFP_ZERO); - - if (!pgd) { - kvm_free_hwpgd(hwpgd); - return -ENOMEM; - } - - /* Plug the HW PGD into the fake one. */ - for (i = 0; i < PTRS_PER_S2_PGD; i++) { - if (KVM_PREALLOC_LEVEL == 1) - pgd_populate(NULL, pgd + i, - (pud_t *)hwpgd + i * PTRS_PER_PUD); - else if (KVM_PREALLOC_LEVEL == 2) - pud_populate(NULL, pud_offset(pgd, 0) + i, - (pmd_t *)hwpgd + i * PTRS_PER_PMD); - } - } else { - /* - * Allocate actual first-level Stage-2 page table used by the - * hardware for Stage-2 page table walks. - */ - pgd = (pgd_t *)hwpgd; + pgd = kvm_setup_fake_pgd(hwpgd); + if (IS_ERR(pgd)) { + kvm_free_hwpgd(hwpgd); + return PTR_ERR(pgd); } kvm_clean_pgd(pgd); @@ -831,9 +800,7 @@ void kvm_free_stage2_pgd(struct kvm *kvm) unmap_stage2_range(kvm, 0, KVM_PHYS_SIZE); kvm_free_hwpgd(kvm_get_hwpgd(kvm)); - if (KVM_PREALLOC_LEVEL > 0) - kfree(kvm->arch.pgd); - + kvm_free_fake_pgd(kvm->arch.pgd); kvm->arch.pgd = NULL; } diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 22732a5e3119..9a3409f7b37a 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -208,6 +208,49 @@ static inline unsigned int kvm_get_hwpgd_size(void) return PTRS_PER_S2_PGD * sizeof(pgd_t); } +/* + * Allocate fake pgd for the host kernel page table macros to work. + * This is not used by the hardware and we have no alignment + * requirement for this allocation. + */ +static inline pgd_t *kvm_setup_fake_pgd(pgd_t *hwpgd) +{ + int i; + pgd_t *pgd; + + if (!KVM_PREALLOC_LEVEL) + return hwpgd; + + /* + * When KVM_PREALLOC_LEVEL==2, we allocate a single page for + * the PMD and the kernel will use folded pud. + * When KVM_PREALLOC_LEVEL==1, we allocate 2 consecutive PUD + * pages. + */ + + pgd = kmalloc(PTRS_PER_S2_PGD * sizeof(pgd_t), + GFP_KERNEL | __GFP_ZERO); + if (!pgd) + return ERR_PTR(-ENOMEM); + + /* Plug the HW PGD into the fake one. */ + for (i = 0; i < PTRS_PER_S2_PGD; i++) { + if (KVM_PREALLOC_LEVEL == 1) + pgd_populate(NULL, pgd + i, + (pud_t *)hwpgd + i * PTRS_PER_PUD); + else if (KVM_PREALLOC_LEVEL == 2) + pud_populate(NULL, pud_offset(pgd, 0) + i, + (pmd_t *)hwpgd + i * PTRS_PER_PMD); + } + + return pgd; +} + +static inline void kvm_free_fake_pgd(pgd_t *pgd) +{ + if (KVM_PREALLOC_LEVEL > 0) + kfree(pgd); +} static inline bool kvm_page_empty(void *ptr) { struct page *ptr_page = virt_to_page(ptr); From 0dbd3b18c63c81dec1a8c47667d89c54ade9b52a Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Tue, 15 Mar 2016 10:46:34 +0000 Subject: [PATCH 04/29] arm64: Introduce pmd_thp_or_huge Add a helper to determine if a given pmd represents a huge page either by hugetlb or thp, as we have for arm. This will be used by KVM MMU code. Suggested-by: Mark Rutland Cc: Catalin Marinas Cc: Steve Capper Cc: Will Deacon Acked-by: Marc Zyngier Acked-by: Will Deacon Acked-by: Christoffer Dall Signed-off-by: Suzuki K Poulose --- arch/arm64/include/asm/pgtable.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 989fef16d461..dda4aa9ba3f8 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -290,6 +290,8 @@ static inline pgprot_t mk_sect_prot(pgprot_t prot) #define pmd_mkyoung(pmd) pte_pmd(pte_mkyoung(pmd_pte(pmd))) #define pmd_mknotpresent(pmd) (__pmd(pmd_val(pmd) & ~PMD_TYPE_MASK)) +#define pmd_thp_or_huge(pmd) (pmd_huge(pmd) || pmd_trans_huge(pmd)) + #define __HAVE_ARCH_PMD_WRITE #define pmd_write(pmd) pte_write(pmd_pte(pmd)) From bbb3b6b35087539e75792b46e07b7ce5282d0979 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Tue, 1 Mar 2016 12:00:39 +0000 Subject: [PATCH 05/29] kvm-arm: Replace kvm_pmd_huge with pmd_thp_or_huge Both arm and arm64 now provides a helper, pmd_thp_or_huge() to check if the given pmd represents a huge page. Use that instead of our own custom check. Suggested-by: Mark Rutland Cc: Marc Zyngier Acked-by: Marc Zyngier Acked-by: Christoffer Dall Signed-off-by: Suzuki K Poulose --- arch/arm/kvm/mmu.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 774d00b8066b..7837f0afa5a4 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -45,7 +45,6 @@ static phys_addr_t hyp_idmap_vector; #define hyp_pgd_order get_order(PTRS_PER_PGD * sizeof(pgd_t)) -#define kvm_pmd_huge(_x) (pmd_huge(_x) || pmd_trans_huge(_x)) #define kvm_pud_huge(_x) pud_huge(_x) #define KVM_S2PTE_FLAG_IS_IOMAP (1UL << 0) @@ -115,7 +114,7 @@ static bool kvm_is_device_pfn(unsigned long pfn) */ static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd) { - if (!kvm_pmd_huge(*pmd)) + if (!pmd_thp_or_huge(*pmd)) return; pmd_clear(pmd); @@ -177,7 +176,7 @@ static void clear_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr) static void clear_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr) { pte_t *pte_table = pte_offset_kernel(pmd, 0); - VM_BUG_ON(kvm_pmd_huge(*pmd)); + VM_BUG_ON(pmd_thp_or_huge(*pmd)); pmd_clear(pmd); kvm_tlb_flush_vmid_ipa(kvm, addr); pte_free_kernel(NULL, pte_table); @@ -240,7 +239,7 @@ static void unmap_pmds(struct kvm *kvm, pud_t *pud, do { next = kvm_pmd_addr_end(addr, end); if (!pmd_none(*pmd)) { - if (kvm_pmd_huge(*pmd)) { + if (pmd_thp_or_huge(*pmd)) { pmd_t old_pmd = *pmd; pmd_clear(pmd); @@ -326,7 +325,7 @@ static void stage2_flush_pmds(struct kvm *kvm, pud_t *pud, do { next = kvm_pmd_addr_end(addr, end); if (!pmd_none(*pmd)) { - if (kvm_pmd_huge(*pmd)) + if (pmd_thp_or_huge(*pmd)) kvm_flush_dcache_pmd(*pmd); else stage2_flush_ptes(kvm, pmd, addr, next); @@ -1050,7 +1049,7 @@ static void stage2_wp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end) do { next = kvm_pmd_addr_end(addr, end); if (!pmd_none(*pmd)) { - if (kvm_pmd_huge(*pmd)) { + if (pmd_thp_or_huge(*pmd)) { if (!kvm_s2pmd_readonly(pmd)) kvm_set_s2pmd_readonly(pmd); } else { @@ -1331,7 +1330,7 @@ static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) if (!pmd || pmd_none(*pmd)) /* Nothing there */ goto out; - if (kvm_pmd_huge(*pmd)) { /* THP, HugeTLB */ + if (pmd_thp_or_huge(*pmd)) { /* THP, HugeTLB */ *pmd = pmd_mkyoung(*pmd); pfn = pmd_pfn(*pmd); pfn_valid = true; @@ -1555,7 +1554,7 @@ static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, void *data) if (!pmd || pmd_none(*pmd)) /* Nothing there */ return 0; - if (kvm_pmd_huge(*pmd)) { /* THP, HugeTLB */ + if (pmd_thp_or_huge(*pmd)) { /* THP, HugeTLB */ if (pmd_young(*pmd)) { *pmd = pmd_mkold(*pmd); return 1; @@ -1585,7 +1584,7 @@ static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, void *data) if (!pmd || pmd_none(*pmd)) /* Nothing there */ return 0; - if (kvm_pmd_huge(*pmd)) /* THP, HugeTLB */ + if (pmd_thp_or_huge(*pmd)) /* THP, HugeTLB */ return pmd_young(*pmd); pte = pte_offset_kernel(pmd, gpa); From 77b5665141a9a7e69d2f685ee2f2a3698fd27397 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Tue, 22 Mar 2016 14:06:47 +0000 Subject: [PATCH 06/29] kvm-arm: Remove kvm_pud_huge() Get rid of kvm_pud_huge() which falls back to pud_huge. Use pud_huge instead. Acked-by: Christoffer Dall Acked-by: Marc Zyngier Signed-off-by: Suzuki K Poulose --- arch/arm/kvm/mmu.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 7837f0afa5a4..d0c0ee92c378 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -45,8 +45,6 @@ static phys_addr_t hyp_idmap_vector; #define hyp_pgd_order get_order(PTRS_PER_PGD * sizeof(pgd_t)) -#define kvm_pud_huge(_x) pud_huge(_x) - #define KVM_S2PTE_FLAG_IS_IOMAP (1UL << 0) #define KVM_S2_FLAG_LOGGING_ACTIVE (1UL << 1) @@ -1077,7 +1075,7 @@ static void stage2_wp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end) next = kvm_pud_addr_end(addr, end); if (!pud_none(*pud)) { /* TODO:PUD not supported, revisit later if supported */ - BUG_ON(kvm_pud_huge(*pud)); + BUG_ON(pud_huge(*pud)); stage2_wp_pmds(pud, addr, next); } } while (pud++, addr = next, addr != end); From b1ae9a30f18d22e1ae544e4c37a032d018ebdac5 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Tue, 22 Mar 2016 14:08:17 +0000 Subject: [PATCH 07/29] kvm-arm: arm32: Introduce stage2 page table helpers Define the page table helpers for walking the stage2 pagetable for arm. Since both hyp and stage2 have the same number of levels, as that of the host we reuse the host helpers. The exceptions are the p.d_addr_end routines which have to deal with IPA > 32bit, hence we use the open coded version of their host helpers which supports 64bit. Acked-by: Marc Zyngier Reviewed-by: Christoffer Dall Signed-off-by: Suzuki K Poulose Signed-off-by: Christoffer Dall --- arch/arm/include/asm/kvm_mmu.h | 1 + arch/arm/include/asm/stage2_pgtable.h | 61 +++++++++++++++++++++++++++ 2 files changed, 62 insertions(+) create mode 100644 arch/arm/include/asm/stage2_pgtable.h diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index c2b2b27b7da1..7d207b44a656 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -47,6 +47,7 @@ #include #include #include +#include int create_hyp_mappings(void *from, void *to); int create_hyp_io_mappings(void *from, void *to, phys_addr_t); diff --git a/arch/arm/include/asm/stage2_pgtable.h b/arch/arm/include/asm/stage2_pgtable.h new file mode 100644 index 000000000000..460d616bb2d6 --- /dev/null +++ b/arch/arm/include/asm/stage2_pgtable.h @@ -0,0 +1,61 @@ +/* + * Copyright (C) 2016 - ARM Ltd + * + * stage2 page table helpers + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#ifndef __ARM_S2_PGTABLE_H_ +#define __ARM_S2_PGTABLE_H_ + +#define stage2_pgd_none(pgd) pgd_none(pgd) +#define stage2_pgd_clear(pgd) pgd_clear(pgd) +#define stage2_pgd_present(pgd) pgd_present(pgd) +#define stage2_pgd_populate(pgd, pud) pgd_populate(NULL, pgd, pud) +#define stage2_pud_offset(pgd, address) pud_offset(pgd, address) +#define stage2_pud_free(pud) pud_free(NULL, pud) + +#define stage2_pud_none(pud) pud_none(pud) +#define stage2_pud_clear(pud) pud_clear(pud) +#define stage2_pud_present(pud) pud_present(pud) +#define stage2_pud_populate(pud, pmd) pud_populate(NULL, pud, pmd) +#define stage2_pmd_offset(pud, address) pmd_offset(pud, address) +#define stage2_pmd_free(pmd) pmd_free(NULL, pmd) + +#define stage2_pud_huge(pud) pud_huge(pud) + +/* Open coded p*d_addr_end that can deal with 64bit addresses */ +static inline phys_addr_t stage2_pgd_addr_end(phys_addr_t addr, phys_addr_t end) +{ + phys_addr_t boundary = (addr + PGDIR_SIZE) & PGDIR_MASK; + + return (boundary - 1 < end - 1) ? boundary : end; +} + +#define stage2_pud_addr_end(addr, end) (end) + +static inline phys_addr_t stage2_pmd_addr_end(phys_addr_t addr, phys_addr_t end) +{ + phys_addr_t boundary = (addr + PMD_SIZE) & PMD_MASK; + + return (boundary - 1 < end - 1) ? boundary : end; +} + +#define stage2_pgd_index(addr) pgd_index(addr) + +#define stage2_pte_table_empty(ptep) kvm_page_empty(ptep) +#define stage2_pmd_table_empty(pmdp) kvm_page_empty(pmdp) +#define stage2_pud_table_empty(pudp) false + +#endif /* __ARM_S2_PGTABLE_H_ */ From b1d030a73e9039dcdec951327e9ce9e8d682067f Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Tue, 22 Mar 2016 17:15:55 +0000 Subject: [PATCH 08/29] kvm-arm: arm: Introduce hyp page table empty checks Introduce hyp_pxx_table_empty helpers for checking whether a given table entry is empty. This will be used explicitly once we switch to explicit routines for hyp page table walk. Acked-by: Marc Zyngier Reviewed-by: Christoffer Dall Signed-off-by: Suzuki K Poulose Signed-off-by: Christoffer Dall --- arch/arm/include/asm/kvm_mmu.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index 7d207b44a656..5522cdd9dedf 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -160,7 +160,11 @@ static inline bool kvm_page_empty(void *ptr) #define kvm_pte_table_empty(kvm, ptep) kvm_page_empty(ptep) #define kvm_pmd_table_empty(kvm, pmdp) kvm_page_empty(pmdp) -#define kvm_pud_table_empty(kvm, pudp) (0) +#define kvm_pud_table_empty(kvm, pudp) false + +#define hyp_pte_table_empty(ptep) kvm_page_empty(ptep) +#define hyp_pmd_table_empty(pmdp) kvm_page_empty(pmdp) +#define hyp_pud_table_empty(pudp) false static inline void *kvm_get_hwpgd(struct kvm *kvm) { From c0ef6326dd393d84c9906240164e803536b5f3fc Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Tue, 22 Mar 2016 14:16:52 +0000 Subject: [PATCH 09/29] kvm-arm: arm64: Introduce stage2 page table helpers Introduce stage2 page table helpers for arm64. With the fake page table level still in place, the stage2 table has the same number of levels as that of the host (and hyp), so they all fallback to the host version. Acked-by: Marc Zyngier Reviewed-by: Christoffer Dall Signed-off-by: Suzuki K Poulose --- arch/arm64/include/asm/kvm_mmu.h | 29 +-------- arch/arm64/include/asm/stage2_pgtable.h | 86 +++++++++++++++++++++++++ 2 files changed, 88 insertions(+), 27 deletions(-) create mode 100644 arch/arm64/include/asm/stage2_pgtable.h diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 9a3409f7b37a..e3f53e3eb4d9 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -91,6 +91,8 @@ alternative_endif #define KVM_PHYS_SIZE (1UL << KVM_PHYS_SHIFT) #define KVM_PHYS_MASK (KVM_PHYS_SIZE - 1UL) +#include + int create_hyp_mappings(void *from, void *to); int create_hyp_io_mappings(void *from, void *to, phys_addr_t); void free_boot_hyp_pgd(void); @@ -156,35 +158,8 @@ static inline bool kvm_s2pmd_readonly(pmd_t *pmd) #define kvm_pud_addr_end(addr, end) pud_addr_end(addr, end) #define kvm_pmd_addr_end(addr, end) pmd_addr_end(addr, end) -/* - * In the case where PGDIR_SHIFT is larger than KVM_PHYS_SHIFT, we can address - * the entire IPA input range with a single pgd entry, and we would only need - * one pgd entry. Note that in this case, the pgd is actually not used by - * the MMU for Stage-2 translations, but is merely a fake pgd used as a data - * structure for the kernel pgtable macros to work. - */ -#if PGDIR_SHIFT > KVM_PHYS_SHIFT -#define PTRS_PER_S2_PGD_SHIFT 0 -#else -#define PTRS_PER_S2_PGD_SHIFT (KVM_PHYS_SHIFT - PGDIR_SHIFT) -#endif -#define PTRS_PER_S2_PGD (1 << PTRS_PER_S2_PGD_SHIFT) - #define kvm_pgd_index(addr) (((addr) >> PGDIR_SHIFT) & (PTRS_PER_S2_PGD - 1)) -/* - * If we are concatenating first level stage-2 page tables, we would have less - * than or equal to 16 pointers in the fake PGD, because that's what the - * architecture allows. In this case, (4 - CONFIG_PGTABLE_LEVELS) - * represents the first level for the host, and we add 1 to go to the next - * level (which uses contatenation) for the stage-2 tables. - */ -#if PTRS_PER_S2_PGD <= 16 -#define KVM_PREALLOC_LEVEL (4 - CONFIG_PGTABLE_LEVELS + 1) -#else -#define KVM_PREALLOC_LEVEL (0) -#endif - static inline void *kvm_get_hwpgd(struct kvm *kvm) { pgd_t *pgd = kvm->arch.pgd; diff --git a/arch/arm64/include/asm/stage2_pgtable.h b/arch/arm64/include/asm/stage2_pgtable.h new file mode 100644 index 000000000000..0ec218fe83c5 --- /dev/null +++ b/arch/arm64/include/asm/stage2_pgtable.h @@ -0,0 +1,86 @@ +/* + * Copyright (C) 2016 - ARM Ltd + * + * stage2 page table helpers + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#ifndef __ARM64_S2_PGTABLE_H_ +#define __ARM64_S2_PGTABLE_H_ + +#include + +/* + * In the case where PGDIR_SHIFT is larger than KVM_PHYS_SHIFT, we can address + * the entire IPA input range with a single pgd entry, and we would only need + * one pgd entry. Note that in this case, the pgd is actually not used by + * the MMU for Stage-2 translations, but is merely a fake pgd used as a data + * structure for the kernel pgtable macros to work. + */ +#if PGDIR_SHIFT > KVM_PHYS_SHIFT +#define PTRS_PER_S2_PGD_SHIFT 0 +#else +#define PTRS_PER_S2_PGD_SHIFT (KVM_PHYS_SHIFT - PGDIR_SHIFT) +#endif +#define PTRS_PER_S2_PGD (1 << PTRS_PER_S2_PGD_SHIFT) + +/* + * If we are concatenating first level stage-2 page tables, we would have less + * than or equal to 16 pointers in the fake PGD, because that's what the + * architecture allows. In this case, (4 - CONFIG_PGTABLE_LEVELS) + * represents the first level for the host, and we add 1 to go to the next + * level (which uses contatenation) for the stage-2 tables. + */ +#if PTRS_PER_S2_PGD <= 16 +#define KVM_PREALLOC_LEVEL (4 - CONFIG_PGTABLE_LEVELS + 1) +#else +#define KVM_PREALLOC_LEVEL (0) +#endif + +#define stage2_pgd_none(pgd) pgd_none(pgd) +#define stage2_pgd_clear(pgd) pgd_clear(pgd) +#define stage2_pgd_present(pgd) pgd_present(pgd) +#define stage2_pgd_populate(pgd, pud) pgd_populate(NULL, pgd, pud) +#define stage2_pud_offset(pgd, address) pud_offset(pgd, address) +#define stage2_pud_free(pud) pud_free(NULL, pud) + +#define stage2_pud_none(pud) pud_none(pud) +#define stage2_pud_clear(pud) pud_clear(pud) +#define stage2_pud_present(pud) pud_present(pud) +#define stage2_pud_populate(pud, pmd) pud_populate(NULL, pud, pmd) +#define stage2_pmd_offset(pud, address) pmd_offset(pud, address) +#define stage2_pmd_free(pmd) pmd_free(NULL, pmd) + +#define stage2_pud_huge(pud) pud_huge(pud) + +#define stage2_pgd_addr_end(address, end) pgd_addr_end(address, end) +#define stage2_pud_addr_end(address, end) pud_addr_end(address, end) +#define stage2_pmd_addr_end(address, end) pmd_addr_end(address, end) + +#define stage2_pte_table_empty(ptep) kvm_page_empty(ptep) +#ifdef __PGTABLE_PMD_FOLDED +#define stage2_pmd_table_empty(pmdp) (0) +#else +#define stage2_pmd_table_empty(pmdp) ((KVM_PREALLOC_LEVEL < 2) && kvm_page_empty(pmdp)) +#endif + +#ifdef __PGTABLE_PUD_FOLDED +#define stage2_pud_table_empty(pudp) (0) +#else +#define stage2_pud_table_empty(pudp) ((KVM_PREALLOC_LEVEL < 1) && kvm_page_empty(pudp)) +#endif + +#define stage2_pgd_index(addr) (((addr) >> PGDIR_SHIFT) & (PTRS_PER_S2_PGD - 1)) + +#endif /* __ARM64_S2_PGTABLE_H_ */ From 66f877faf9cc23232f25b423758eaa167de1ad09 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Tue, 22 Mar 2016 17:20:28 +0000 Subject: [PATCH 10/29] kvm-arm: arm64: Introduce hyp page table empty checks Introduce hyp_pxx_table_empty helpers for checking whether a given table entry is empty. This will be used explicitly once we switch to explicit routines for hyp page table walk. Acked-by: Marc Zyngier Acked-by: Christoffer Dall Signed-off-by: Suzuki K Poulose --- arch/arm64/include/asm/kvm_mmu.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index e3f53e3eb4d9..edf3c62c660e 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -249,6 +249,20 @@ static inline bool kvm_page_empty(void *ptr) #endif +#define hyp_pte_table_empty(ptep) kvm_page_empty(ptep) + +#ifdef __PAGETABLE_PMD_FOLDED +#define hyp_pmd_table_empty(pmdp) (0) +#else +#define hyp_pmd_table_empty(pmdp) kvm_page_empty(pmdp) +#endif + +#ifdef __PAGETABLE_PUD_FOLDED +#define hyp_pud_table_empty(pudp) (0) +#else +#define hyp_pud_table_empty(pudp) kvm_page_empty(pudp) +#endif + struct kvm; #define kvm_flush_dcache_to_poc(a,l) __flush_dcache_area((a), (l)) From 70fd19068573e449d47eb2daa69cf5db541ef4f5 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Tue, 22 Mar 2016 18:33:45 +0000 Subject: [PATCH 11/29] kvm-arm: Use explicit stage2 helper routines We have stage2 page table helpers for both arm and arm64. Switch to the stage2 helpers for routines that only deal with stage2 page table. Cc: Marc Zyngier Acked-by: Christoffer Dall Signed-off-by: Suzuki K Poulose --- arch/arm/kvm/mmu.c | 48 +++++++++++++++++++++++----------------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index d0c0ee92c378..f93f717b5d8b 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -319,9 +319,9 @@ static void stage2_flush_pmds(struct kvm *kvm, pud_t *pud, pmd_t *pmd; phys_addr_t next; - pmd = pmd_offset(pud, addr); + pmd = stage2_pmd_offset(pud, addr); do { - next = kvm_pmd_addr_end(addr, end); + next = stage2_pmd_addr_end(addr, end); if (!pmd_none(*pmd)) { if (pmd_thp_or_huge(*pmd)) kvm_flush_dcache_pmd(*pmd); @@ -337,11 +337,11 @@ static void stage2_flush_puds(struct kvm *kvm, pgd_t *pgd, pud_t *pud; phys_addr_t next; - pud = pud_offset(pgd, addr); + pud = stage2_pud_offset(pgd, addr); do { - next = kvm_pud_addr_end(addr, end); - if (!pud_none(*pud)) { - if (pud_huge(*pud)) + next = stage2_pud_addr_end(addr, end); + if (!stage2_pud_none(*pud)) { + if (stage2_pud_huge(*pud)) kvm_flush_dcache_pud(*pud); else stage2_flush_pmds(kvm, pud, addr, next); @@ -357,9 +357,9 @@ static void stage2_flush_memslot(struct kvm *kvm, phys_addr_t next; pgd_t *pgd; - pgd = kvm->arch.pgd + kvm_pgd_index(addr); + pgd = kvm->arch.pgd + stage2_pgd_index(addr); do { - next = kvm_pgd_addr_end(addr, end); + next = stage2_pgd_addr_end(addr, end); stage2_flush_puds(kvm, pgd, addr, next); } while (pgd++, addr = next, addr != end); } @@ -807,16 +807,16 @@ static pud_t *stage2_get_pud(struct kvm *kvm, struct kvm_mmu_memory_cache *cache pgd_t *pgd; pud_t *pud; - pgd = kvm->arch.pgd + kvm_pgd_index(addr); - if (WARN_ON(pgd_none(*pgd))) { + pgd = kvm->arch.pgd + stage2_pgd_index(addr); + if (WARN_ON(stage2_pgd_none(*pgd))) { if (!cache) return NULL; pud = mmu_memory_cache_alloc(cache); - pgd_populate(NULL, pgd, pud); + stage2_pgd_populate(pgd, pud); get_page(virt_to_page(pgd)); } - return pud_offset(pgd, addr); + return stage2_pud_offset(pgd, addr); } static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, @@ -826,15 +826,15 @@ static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache pmd_t *pmd; pud = stage2_get_pud(kvm, cache, addr); - if (pud_none(*pud)) { + if (stage2_pud_none(*pud)) { if (!cache) return NULL; pmd = mmu_memory_cache_alloc(cache); - pud_populate(NULL, pud, pmd); + stage2_pud_populate(pud, pmd); get_page(virt_to_page(pud)); } - return pmd_offset(pud, addr); + return stage2_pmd_offset(pud, addr); } static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache @@ -1042,10 +1042,10 @@ static void stage2_wp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end) pmd_t *pmd; phys_addr_t next; - pmd = pmd_offset(pud, addr); + pmd = stage2_pmd_offset(pud, addr); do { - next = kvm_pmd_addr_end(addr, end); + next = stage2_pmd_addr_end(addr, end); if (!pmd_none(*pmd)) { if (pmd_thp_or_huge(*pmd)) { if (!kvm_s2pmd_readonly(pmd)) @@ -1070,12 +1070,12 @@ static void stage2_wp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end) pud_t *pud; phys_addr_t next; - pud = pud_offset(pgd, addr); + pud = stage2_pud_offset(pgd, addr); do { - next = kvm_pud_addr_end(addr, end); - if (!pud_none(*pud)) { + next = stage2_pud_addr_end(addr, end); + if (!stage2_pud_none(*pud)) { /* TODO:PUD not supported, revisit later if supported */ - BUG_ON(pud_huge(*pud)); + BUG_ON(stage2_pud_huge(*pud)); stage2_wp_pmds(pud, addr, next); } } while (pud++, addr = next, addr != end); @@ -1092,7 +1092,7 @@ static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) pgd_t *pgd; phys_addr_t next; - pgd = kvm->arch.pgd + kvm_pgd_index(addr); + pgd = kvm->arch.pgd + stage2_pgd_index(addr); do { /* * Release kvm_mmu_lock periodically if the memory region is @@ -1104,8 +1104,8 @@ static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) if (need_resched() || spin_needbreak(&kvm->mmu_lock)) cond_resched_lock(&kvm->mmu_lock); - next = kvm_pgd_addr_end(addr, end); - if (pgd_present(*pgd)) + next = stage2_pgd_addr_end(addr, end); + if (stage2_pgd_present(*pgd)) stage2_wp_puds(pgd, addr, next); } while (pgd++, addr = next, addr != end); } From 64f324979210d4064adf64f19da40c125c9dd137 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Tue, 22 Mar 2016 18:56:21 +0000 Subject: [PATCH 12/29] kvm-arm: Add explicit hyp page table modifiers We have common routines to modify hyp and stage2 page tables based on the 'kvm' parameter. For a smoother transition to using separate routines for each, duplicate the routines and modify the copy to work on hyp. Marks the forked routines with _hyp_ and gets rid of the kvm parameter which is no longer needed and is NULL for hyp. Also, gets rid of calls to kvm_tlb_flush_by_vmid_ipa() calls from the hyp versions. Uses explicit host page table accessors instead of the kvm_* page table helpers. Suggested-by: Christoffer Dall Cc: Marc Zyngier Reviewed-by: Christoffer Dall Signed-off-by: Suzuki K Poulose --- arch/arm/kvm/mmu.c | 104 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 99 insertions(+), 5 deletions(-) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index f93f717b5d8b..af526f67022c 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -388,6 +388,100 @@ static void stage2_flush_vm(struct kvm *kvm) srcu_read_unlock(&kvm->srcu, idx); } +static void clear_hyp_pgd_entry(pgd_t *pgd) +{ + pud_t *pud_table __maybe_unused = pud_offset(pgd, 0UL); + pgd_clear(pgd); + pud_free(NULL, pud_table); + put_page(virt_to_page(pgd)); +} + +static void clear_hyp_pud_entry(pud_t *pud) +{ + pmd_t *pmd_table __maybe_unused = pmd_offset(pud, 0); + VM_BUG_ON(pud_huge(*pud)); + pud_clear(pud); + pmd_free(NULL, pmd_table); + put_page(virt_to_page(pud)); +} + +static void clear_hyp_pmd_entry(pmd_t *pmd) +{ + pte_t *pte_table = pte_offset_kernel(pmd, 0); + VM_BUG_ON(pmd_thp_or_huge(*pmd)); + pmd_clear(pmd); + pte_free_kernel(NULL, pte_table); + put_page(virt_to_page(pmd)); +} + +static void unmap_hyp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end) +{ + pte_t *pte, *start_pte; + + start_pte = pte = pte_offset_kernel(pmd, addr); + do { + if (!pte_none(*pte)) { + kvm_set_pte(pte, __pte(0)); + put_page(virt_to_page(pte)); + } + } while (pte++, addr += PAGE_SIZE, addr != end); + + if (hyp_pte_table_empty(start_pte)) + clear_hyp_pmd_entry(pmd); +} + +static void unmap_hyp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end) +{ + phys_addr_t next; + pmd_t *pmd, *start_pmd; + + start_pmd = pmd = pmd_offset(pud, addr); + do { + next = pmd_addr_end(addr, end); + /* Hyp doesn't use huge pmds */ + if (!pmd_none(*pmd)) + unmap_hyp_ptes(pmd, addr, next); + } while (pmd++, addr = next, addr != end); + + if (hyp_pmd_table_empty(start_pmd)) + clear_hyp_pud_entry(pud); +} + +static void unmap_hyp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end) +{ + phys_addr_t next; + pud_t *pud, *start_pud; + + start_pud = pud = pud_offset(pgd, addr); + do { + next = pud_addr_end(addr, end); + /* Hyp doesn't use huge puds */ + if (!pud_none(*pud)) + unmap_hyp_pmds(pud, addr, next); + } while (pud++, addr = next, addr != end); + + if (hyp_pud_table_empty(start_pud)) + clear_hyp_pgd_entry(pgd); +} + +static void unmap_hyp_range(pgd_t *pgdp, phys_addr_t start, u64 size) +{ + pgd_t *pgd; + phys_addr_t addr = start, end = start + size; + phys_addr_t next; + + /* + * We don't unmap anything from HYP, except at the hyp tear down. + * Hence, we don't have to invalidate the TLBs here. + */ + pgd = pgdp + pgd_index(addr); + do { + next = pgd_addr_end(addr, end); + if (!pgd_none(*pgd)) + unmap_hyp_puds(pgd, addr, next); + } while (pgd++, addr = next, addr != end); +} + /** * free_boot_hyp_pgd - free HYP boot page tables * @@ -398,14 +492,14 @@ void free_boot_hyp_pgd(void) mutex_lock(&kvm_hyp_pgd_mutex); if (boot_hyp_pgd) { - unmap_range(NULL, boot_hyp_pgd, hyp_idmap_start, PAGE_SIZE); - unmap_range(NULL, boot_hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE); + unmap_hyp_range(boot_hyp_pgd, hyp_idmap_start, PAGE_SIZE); + unmap_hyp_range(boot_hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE); free_pages((unsigned long)boot_hyp_pgd, hyp_pgd_order); boot_hyp_pgd = NULL; } if (hyp_pgd) - unmap_range(NULL, hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE); + unmap_hyp_range(hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE); mutex_unlock(&kvm_hyp_pgd_mutex); } @@ -430,9 +524,9 @@ void free_hyp_pgds(void) if (hyp_pgd) { for (addr = PAGE_OFFSET; virt_addr_valid(addr); addr += PGDIR_SIZE) - unmap_range(NULL, hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE); + unmap_hyp_range(hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE); for (addr = VMALLOC_START; is_vmalloc_addr((void*)addr); addr += PGDIR_SIZE) - unmap_range(NULL, hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE); + unmap_hyp_range(hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE); free_pages((unsigned long)hyp_pgd, hyp_pgd_order); hyp_pgd = NULL; From 7a1c831ee8553b8199f21183942a46adf808f174 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Wed, 23 Mar 2016 12:08:02 +0000 Subject: [PATCH 13/29] kvm-arm: Add stage2 page table modifiers Now that the hyp page table is handled by different set of routines, rename the original shared routines to stage2 handlers. Also make explicit use of the stage2 page table helpers. unmap_range has been merged to existing unmap_stage2_range. Cc: Marc Zyngier Acked-by: Christoffer Dall Signed-off-by: Suzuki K Poulose --- arch/arm/kvm/mmu.c | 97 +++++++++++++++++++++------------------------- 1 file changed, 44 insertions(+), 53 deletions(-) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index af526f67022c..f2a6d9b8ca2d 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -152,26 +152,26 @@ static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc) return p; } -static void clear_pgd_entry(struct kvm *kvm, pgd_t *pgd, phys_addr_t addr) +static void clear_stage2_pgd_entry(struct kvm *kvm, pgd_t *pgd, phys_addr_t addr) { - pud_t *pud_table __maybe_unused = pud_offset(pgd, 0); - pgd_clear(pgd); + pud_t *pud_table __maybe_unused = stage2_pud_offset(pgd, 0UL); + stage2_pgd_clear(pgd); kvm_tlb_flush_vmid_ipa(kvm, addr); - pud_free(NULL, pud_table); + stage2_pud_free(pud_table); put_page(virt_to_page(pgd)); } -static void clear_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr) +static void clear_stage2_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr) { - pmd_t *pmd_table = pmd_offset(pud, 0); - VM_BUG_ON(pud_huge(*pud)); - pud_clear(pud); + pmd_t *pmd_table __maybe_unused = stage2_pmd_offset(pud, 0); + VM_BUG_ON(stage2_pud_huge(*pud)); + stage2_pud_clear(pud); kvm_tlb_flush_vmid_ipa(kvm, addr); - pmd_free(NULL, pmd_table); + stage2_pmd_free(pmd_table); put_page(virt_to_page(pud)); } -static void clear_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr) +static void clear_stage2_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr) { pte_t *pte_table = pte_offset_kernel(pmd, 0); VM_BUG_ON(pmd_thp_or_huge(*pmd)); @@ -201,7 +201,7 @@ static void clear_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr) * the corresponding TLBs, we call kvm_flush_dcache_p*() to make sure * the IO subsystem will never hit in the cache. */ -static void unmap_ptes(struct kvm *kvm, pmd_t *pmd, +static void unmap_stage2_ptes(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr, phys_addr_t end) { phys_addr_t start_addr = addr; @@ -223,19 +223,19 @@ static void unmap_ptes(struct kvm *kvm, pmd_t *pmd, } } while (pte++, addr += PAGE_SIZE, addr != end); - if (kvm_pte_table_empty(kvm, start_pte)) - clear_pmd_entry(kvm, pmd, start_addr); + if (stage2_pte_table_empty(start_pte)) + clear_stage2_pmd_entry(kvm, pmd, start_addr); } -static void unmap_pmds(struct kvm *kvm, pud_t *pud, +static void unmap_stage2_pmds(struct kvm *kvm, pud_t *pud, phys_addr_t addr, phys_addr_t end) { phys_addr_t next, start_addr = addr; pmd_t *pmd, *start_pmd; - start_pmd = pmd = pmd_offset(pud, addr); + start_pmd = pmd = stage2_pmd_offset(pud, addr); do { - next = kvm_pmd_addr_end(addr, end); + next = stage2_pmd_addr_end(addr, end); if (!pmd_none(*pmd)) { if (pmd_thp_or_huge(*pmd)) { pmd_t old_pmd = *pmd; @@ -247,57 +247,64 @@ static void unmap_pmds(struct kvm *kvm, pud_t *pud, put_page(virt_to_page(pmd)); } else { - unmap_ptes(kvm, pmd, addr, next); + unmap_stage2_ptes(kvm, pmd, addr, next); } } } while (pmd++, addr = next, addr != end); - if (kvm_pmd_table_empty(kvm, start_pmd)) - clear_pud_entry(kvm, pud, start_addr); + if (stage2_pmd_table_empty(start_pmd)) + clear_stage2_pud_entry(kvm, pud, start_addr); } -static void unmap_puds(struct kvm *kvm, pgd_t *pgd, +static void unmap_stage2_puds(struct kvm *kvm, pgd_t *pgd, phys_addr_t addr, phys_addr_t end) { phys_addr_t next, start_addr = addr; pud_t *pud, *start_pud; - start_pud = pud = pud_offset(pgd, addr); + start_pud = pud = stage2_pud_offset(pgd, addr); do { - next = kvm_pud_addr_end(addr, end); - if (!pud_none(*pud)) { - if (pud_huge(*pud)) { + next = stage2_pud_addr_end(addr, end); + if (!stage2_pud_none(*pud)) { + if (stage2_pud_huge(*pud)) { pud_t old_pud = *pud; - pud_clear(pud); + stage2_pud_clear(pud); kvm_tlb_flush_vmid_ipa(kvm, addr); - kvm_flush_dcache_pud(old_pud); - put_page(virt_to_page(pud)); } else { - unmap_pmds(kvm, pud, addr, next); + unmap_stage2_pmds(kvm, pud, addr, next); } } } while (pud++, addr = next, addr != end); - if (kvm_pud_table_empty(kvm, start_pud)) - clear_pgd_entry(kvm, pgd, start_addr); + if (stage2_pud_table_empty(start_pud)) + clear_stage2_pgd_entry(kvm, pgd, start_addr); } - -static void unmap_range(struct kvm *kvm, pgd_t *pgdp, - phys_addr_t start, u64 size) +/** + * unmap_stage2_range -- Clear stage2 page table entries to unmap a range + * @kvm: The VM pointer + * @start: The intermediate physical base address of the range to unmap + * @size: The size of the area to unmap + * + * Clear a range of stage-2 mappings, lowering the various ref-counts. Must + * be called while holding mmu_lock (unless for freeing the stage2 pgd before + * destroying the VM), otherwise another faulting VCPU may come in and mess + * with things behind our backs. + */ +static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size) { pgd_t *pgd; phys_addr_t addr = start, end = start + size; phys_addr_t next; - pgd = pgdp + kvm_pgd_index(addr); + pgd = kvm->arch.pgd + stage2_pgd_index(addr); do { - next = kvm_pgd_addr_end(addr, end); - if (!pgd_none(*pgd)) - unmap_puds(kvm, pgd, addr, next); + next = stage2_pgd_addr_end(addr, end); + if (!stage2_pgd_none(*pgd)) + unmap_stage2_puds(kvm, pgd, addr, next); } while (pgd++, addr = next, addr != end); } @@ -792,22 +799,6 @@ int kvm_alloc_stage2_pgd(struct kvm *kvm) return 0; } -/** - * unmap_stage2_range -- Clear stage2 page table entries to unmap a range - * @kvm: The VM pointer - * @start: The intermediate physical base address of the range to unmap - * @size: The size of the area to unmap - * - * Clear a range of stage-2 mappings, lowering the various ref-counts. Must - * be called while holding mmu_lock (unless for freeing the stage2 pgd before - * destroying the VM), otherwise another faulting VCPU may come in and mess - * with things behind our backs. - */ -static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size) -{ - unmap_range(kvm, kvm->arch.pgd, start, size); -} - static void stage2_unmap_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot) { From 8684e701df5a3f52e3ff580128cbd5d71fcd5f5c Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Tue, 22 Mar 2016 17:14:25 +0000 Subject: [PATCH 14/29] kvm-arm: Cleanup kvm_* wrappers Now that we have switched to explicit page table routines, get rid of the obsolete kvm_* wrappers. Also, kvm_tlb_flush_vmid_by_ipa is now called only on stage2 page tables, hence get rid of the redundant check. Cc: Marc Zyngier Acked-by: Christoffer Dall Signed-off-by: Suzuki K Poulose --- arch/arm/include/asm/kvm_mmu.h | 16 ---------------- arch/arm/kvm/mmu.c | 9 +-------- arch/arm64/include/asm/kvm_mmu.h | 24 ------------------------ 3 files changed, 1 insertion(+), 48 deletions(-) diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index 5522cdd9dedf..a7736d53a408 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -136,22 +136,6 @@ static inline bool kvm_s2pmd_readonly(pmd_t *pmd) return (pmd_val(*pmd) & L_PMD_S2_RDWR) == L_PMD_S2_RDONLY; } - -/* Open coded p*d_addr_end that can deal with 64bit addresses */ -#define kvm_pgd_addr_end(addr, end) \ -({ u64 __boundary = ((addr) + PGDIR_SIZE) & PGDIR_MASK; \ - (__boundary - 1 < (end) - 1)? __boundary: (end); \ -}) - -#define kvm_pud_addr_end(addr,end) (end) - -#define kvm_pmd_addr_end(addr, end) \ -({ u64 __boundary = ((addr) + PMD_SIZE) & PMD_MASK; \ - (__boundary - 1 < (end) - 1)? __boundary: (end); \ -}) - -#define kvm_pgd_index(addr) pgd_index(addr) - static inline bool kvm_page_empty(void *ptr) { struct page *ptr_page = virt_to_page(ptr); diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index f2a6d9b8ca2d..d3fa96e0f709 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -66,14 +66,7 @@ void kvm_flush_remote_tlbs(struct kvm *kvm) static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) { - /* - * This function also gets called when dealing with HYP page - * tables. As HYP doesn't have an associated struct kvm (and - * the HYP page tables are fairly static), we don't do - * anything there. - */ - if (kvm) - kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa); + kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa); } /* diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index edf3c62c660e..a3c0d05311ef 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -153,13 +153,6 @@ static inline bool kvm_s2pmd_readonly(pmd_t *pmd) return (pmd_val(*pmd) & PMD_S2_RDWR) == PMD_S2_RDONLY; } - -#define kvm_pgd_addr_end(addr, end) pgd_addr_end(addr, end) -#define kvm_pud_addr_end(addr, end) pud_addr_end(addr, end) -#define kvm_pmd_addr_end(addr, end) pmd_addr_end(addr, end) - -#define kvm_pgd_index(addr) (((addr) >> PGDIR_SHIFT) & (PTRS_PER_S2_PGD - 1)) - static inline void *kvm_get_hwpgd(struct kvm *kvm) { pgd_t *pgd = kvm->arch.pgd; @@ -232,23 +225,6 @@ static inline bool kvm_page_empty(void *ptr) return page_count(ptr_page) == 1; } -#define kvm_pte_table_empty(kvm, ptep) kvm_page_empty(ptep) - -#ifdef __PAGETABLE_PMD_FOLDED -#define kvm_pmd_table_empty(kvm, pmdp) (0) -#else -#define kvm_pmd_table_empty(kvm, pmdp) \ - (kvm_page_empty(pmdp) && (!(kvm) || KVM_PREALLOC_LEVEL < 2)) -#endif - -#ifdef __PAGETABLE_PUD_FOLDED -#define kvm_pud_table_empty(kvm, pudp) (0) -#else -#define kvm_pud_table_empty(kvm, pudp) \ - (kvm_page_empty(pudp) && (!(kvm) || KVM_PREALLOC_LEVEL < 1)) -#endif - - #define hyp_pte_table_empty(ptep) kvm_page_empty(ptep) #ifdef __PAGETABLE_PMD_FOLDED From da04fa04dc91e7dae79629f28804391cbcf6e604 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Wed, 23 Mar 2016 12:22:33 +0000 Subject: [PATCH 15/29] kvm: arm64: Get rid of fake page table levels On arm64, the hardware supports concatenation of upto 16 tables, at entry level for stage2 translations and we make use that whenever possible. This could lead to reduced number of translation levels than the normal (stage1 table) table. Also, since the IPA(40bit) is smaller than the some of the supported VA_BITS (e.g, 48bit), there could be different number of levels in stage-1 vs stage-2 tables. To reuse the kernel host page table walker for stage2 we have been using a fake software page table level, not known to the hardware. But with 16K translations, there could be upto 2 fake software levels (with 48bit VA and 40bit IPA), which complicates the code. Hence, we want to get rid of the hack. Now that we have explicit accessors for hyp vs stage2 page tables, define the stage2 walker helpers accordingly based on the actual table used by the hardware. Once we know the number of translation levels used by the hardware, it is merely a job of defining the helpers based on whether a particular level is folded or not, looking at the number of levels. Some facts before we calculate the translation levels: 1) Smallest page size supported by arm64 is 4K. 2) The minimum number of bits resolved at any page table level is (PAGE_SHIFT - 3) at intermediate levels. Both of them implies, minimum number of bits required for a level change is 9. Since we can concatenate upto 16 tables at stage2 entry, the total number of page table levels used by the hardware for resolving N bits is same as that for (N - 4) bits (with concatenation), as there cannot be a level in between (N, N-4) as per the above rules. Hence, we have STAGE2_PGTABLE_LEVELS = PGTABLE_LEVELS(KVM_PHYS_SHIFT - 4) With the current IPA limit (40bit), for all supported translations and VA_BITS, we have the following condition (even for 36bit VA with 16K page size): CONFIG_PGTABLE_LEVELS >= STAGE2_PGTABLE_LEVELS. So, for e.g, if PUD is present in stage2, it is present in the hyp(host). Hence, we fall back to the host definition if we find that a level is not folded. Otherwise we redefine it accordingly. A build time check is added to make sure the above condition holds. If this condition breaks in future, we can rearrange the host level helpers and fix our code easily. Cc: Marc Zyngier Cc: Christoffer Dall Reviewed-by: Christoffer Dall Signed-off-by: Suzuki K Poulose --- arch/arm64/include/asm/kvm_mmu.h | 64 +-------- arch/arm64/include/asm/stage2_pgtable-nopmd.h | 42 ++++++ arch/arm64/include/asm/stage2_pgtable-nopud.h | 39 ++++++ arch/arm64/include/asm/stage2_pgtable.h | 122 +++++++++++++----- 4 files changed, 172 insertions(+), 95 deletions(-) create mode 100644 arch/arm64/include/asm/stage2_pgtable-nopmd.h create mode 100644 arch/arm64/include/asm/stage2_pgtable-nopud.h diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index a3c0d05311ef..e3fee0acd1a2 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -45,18 +45,6 @@ */ #define TRAMPOLINE_VA (HYP_PAGE_OFFSET_MASK & PAGE_MASK) -/* - * KVM_MMU_CACHE_MIN_PAGES is the number of stage2 page table translation - * levels in addition to the PGD and potentially the PUD which are - * pre-allocated (we pre-allocate the fake PGD and the PUD when the Stage-2 - * tables use one level of tables less than the kernel. - */ -#ifdef CONFIG_ARM64_64K_PAGES -#define KVM_MMU_CACHE_MIN_PAGES 1 -#else -#define KVM_MMU_CACHE_MIN_PAGES 2 -#endif - #ifdef __ASSEMBLY__ #include @@ -155,69 +143,21 @@ static inline bool kvm_s2pmd_readonly(pmd_t *pmd) static inline void *kvm_get_hwpgd(struct kvm *kvm) { - pgd_t *pgd = kvm->arch.pgd; - pud_t *pud; - - if (KVM_PREALLOC_LEVEL == 0) - return pgd; - - pud = pud_offset(pgd, 0); - if (KVM_PREALLOC_LEVEL == 1) - return pud; - - BUG_ON(KVM_PREALLOC_LEVEL != 2); - return pmd_offset(pud, 0); + return kvm->arch.pgd; } static inline unsigned int kvm_get_hwpgd_size(void) { - if (KVM_PREALLOC_LEVEL > 0) - return PTRS_PER_S2_PGD * PAGE_SIZE; return PTRS_PER_S2_PGD * sizeof(pgd_t); } -/* - * Allocate fake pgd for the host kernel page table macros to work. - * This is not used by the hardware and we have no alignment - * requirement for this allocation. - */ static inline pgd_t *kvm_setup_fake_pgd(pgd_t *hwpgd) { - int i; - pgd_t *pgd; - - if (!KVM_PREALLOC_LEVEL) - return hwpgd; - - /* - * When KVM_PREALLOC_LEVEL==2, we allocate a single page for - * the PMD and the kernel will use folded pud. - * When KVM_PREALLOC_LEVEL==1, we allocate 2 consecutive PUD - * pages. - */ - - pgd = kmalloc(PTRS_PER_S2_PGD * sizeof(pgd_t), - GFP_KERNEL | __GFP_ZERO); - if (!pgd) - return ERR_PTR(-ENOMEM); - - /* Plug the HW PGD into the fake one. */ - for (i = 0; i < PTRS_PER_S2_PGD; i++) { - if (KVM_PREALLOC_LEVEL == 1) - pgd_populate(NULL, pgd + i, - (pud_t *)hwpgd + i * PTRS_PER_PUD); - else if (KVM_PREALLOC_LEVEL == 2) - pud_populate(NULL, pud_offset(pgd, 0) + i, - (pmd_t *)hwpgd + i * PTRS_PER_PMD); - } - - return pgd; + return hwpgd; } static inline void kvm_free_fake_pgd(pgd_t *pgd) { - if (KVM_PREALLOC_LEVEL > 0) - kfree(pgd); } static inline bool kvm_page_empty(void *ptr) { diff --git a/arch/arm64/include/asm/stage2_pgtable-nopmd.h b/arch/arm64/include/asm/stage2_pgtable-nopmd.h new file mode 100644 index 000000000000..2656a0fd05a6 --- /dev/null +++ b/arch/arm64/include/asm/stage2_pgtable-nopmd.h @@ -0,0 +1,42 @@ +/* + * Copyright (C) 2016 - ARM Ltd + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#ifndef __ARM64_S2_PGTABLE_NOPMD_H_ +#define __ARM64_S2_PGTABLE_NOPMD_H_ + +#include + +#define __S2_PGTABLE_PMD_FOLDED + +#define S2_PMD_SHIFT S2_PUD_SHIFT +#define S2_PTRS_PER_PMD 1 +#define S2_PMD_SIZE (1UL << S2_PMD_SHIFT) +#define S2_PMD_MASK (~(S2_PMD_SIZE-1)) + +#define stage2_pud_none(pud) (0) +#define stage2_pud_present(pud) (1) +#define stage2_pud_clear(pud) do { } while (0) +#define stage2_pud_populate(pud, pmd) do { } while (0) +#define stage2_pmd_offset(pud, address) ((pmd_t *)(pud)) + +#define stage2_pmd_free(pmd) do { } while (0) + +#define stage2_pmd_addr_end(addr, end) (end) + +#define stage2_pud_huge(pud) (0) +#define stage2_pmd_table_empty(pmdp) (0) + +#endif diff --git a/arch/arm64/include/asm/stage2_pgtable-nopud.h b/arch/arm64/include/asm/stage2_pgtable-nopud.h new file mode 100644 index 000000000000..5ee87b54ebf3 --- /dev/null +++ b/arch/arm64/include/asm/stage2_pgtable-nopud.h @@ -0,0 +1,39 @@ +/* + * Copyright (C) 2016 - ARM Ltd + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#ifndef __ARM64_S2_PGTABLE_NOPUD_H_ +#define __ARM64_S2_PGTABLE_NOPUD_H_ + +#define __S2_PGTABLE_PUD_FOLDED + +#define S2_PUD_SHIFT S2_PGDIR_SHIFT +#define S2_PTRS_PER_PUD 1 +#define S2_PUD_SIZE (_AC(1, UL) << S2_PUD_SHIFT) +#define S2_PUD_MASK (~(S2_PUD_SIZE-1)) + +#define stage2_pgd_none(pgd) (0) +#define stage2_pgd_present(pgd) (1) +#define stage2_pgd_clear(pgd) do { } while (0) +#define stage2_pgd_populate(pgd, pud) do { } while (0) + +#define stage2_pud_offset(pgd, address) ((pud_t *)(pgd)) + +#define stage2_pud_free(x) do { } while (0) + +#define stage2_pud_addr_end(addr, end) (end) +#define stage2_pud_table_empty(pmdp) (0) + +#endif diff --git a/arch/arm64/include/asm/stage2_pgtable.h b/arch/arm64/include/asm/stage2_pgtable.h index 0ec218fe83c5..8b68099348e5 100644 --- a/arch/arm64/include/asm/stage2_pgtable.h +++ b/arch/arm64/include/asm/stage2_pgtable.h @@ -22,32 +22,61 @@ #include /* - * In the case where PGDIR_SHIFT is larger than KVM_PHYS_SHIFT, we can address - * the entire IPA input range with a single pgd entry, and we would only need - * one pgd entry. Note that in this case, the pgd is actually not used by - * the MMU for Stage-2 translations, but is merely a fake pgd used as a data - * structure for the kernel pgtable macros to work. + * The hardware supports concatenation of up to 16 tables at stage2 entry level + * and we use the feature whenever possible. + * + * Now, the minimum number of bits resolved at any level is (PAGE_SHIFT - 3). + * On arm64, the smallest PAGE_SIZE supported is 4k, which means + * (PAGE_SHIFT - 3) > 4 holds for all page sizes. + * This implies, the total number of page table levels at stage2 expected + * by the hardware is actually the number of levels required for (KVM_PHYS_SHIFT - 4) + * in normal translations(e.g, stage1), since we cannot have another level in + * the range (KVM_PHYS_SHIFT, KVM_PHYS_SHIFT - 4). */ -#if PGDIR_SHIFT > KVM_PHYS_SHIFT -#define PTRS_PER_S2_PGD_SHIFT 0 -#else -#define PTRS_PER_S2_PGD_SHIFT (KVM_PHYS_SHIFT - PGDIR_SHIFT) -#endif -#define PTRS_PER_S2_PGD (1 << PTRS_PER_S2_PGD_SHIFT) +#define STAGE2_PGTABLE_LEVELS ARM64_HW_PGTABLE_LEVELS(KVM_PHYS_SHIFT - 4) /* - * If we are concatenating first level stage-2 page tables, we would have less - * than or equal to 16 pointers in the fake PGD, because that's what the - * architecture allows. In this case, (4 - CONFIG_PGTABLE_LEVELS) - * represents the first level for the host, and we add 1 to go to the next - * level (which uses contatenation) for the stage-2 tables. + * With all the supported VA_BITs and 40bit guest IPA, the following condition + * is always true: + * + * STAGE2_PGTABLE_LEVELS <= CONFIG_PGTABLE_LEVELS + * + * We base our stage-2 page table walker helpers on this assumption and + * fall back to using the host version of the helper wherever possible. + * i.e, if a particular level is not folded (e.g, PUD) at stage2, we fall back + * to using the host version, since it is guaranteed it is not folded at host. + * + * If the condition breaks in the future, we can rearrange the host level + * definitions and reuse them for stage2. Till then... */ -#if PTRS_PER_S2_PGD <= 16 -#define KVM_PREALLOC_LEVEL (4 - CONFIG_PGTABLE_LEVELS + 1) -#else -#define KVM_PREALLOC_LEVEL (0) +#if STAGE2_PGTABLE_LEVELS > CONFIG_PGTABLE_LEVELS +#error "Unsupported combination of guest IPA and host VA_BITS." #endif +/* S2_PGDIR_SHIFT is the size mapped by top-level stage2 entry */ +#define S2_PGDIR_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(4 - STAGE2_PGTABLE_LEVELS) +#define S2_PGDIR_SIZE (_AC(1, UL) << S2_PGDIR_SHIFT) +#define S2_PGDIR_MASK (~(S2_PGDIR_SIZE - 1)) + +/* + * The number of PTRS across all concatenated stage2 tables given by the + * number of bits resolved at the initial level. + */ +#define PTRS_PER_S2_PGD (1 << (KVM_PHYS_SHIFT - S2_PGDIR_SHIFT)) + +/* + * KVM_MMU_CACHE_MIN_PAGES is the number of stage2 page table translation + * levels in addition to the PGD. + */ +#define KVM_MMU_CACHE_MIN_PAGES (STAGE2_PGTABLE_LEVELS - 1) + + +#if STAGE2_PGTABLE_LEVELS > 3 + +#define S2_PUD_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(1) +#define S2_PUD_SIZE (_AC(1, UL) << S2_PUD_SHIFT) +#define S2_PUD_MASK (~(S2_PUD_SIZE - 1)) + #define stage2_pgd_none(pgd) pgd_none(pgd) #define stage2_pgd_clear(pgd) pgd_clear(pgd) #define stage2_pgd_present(pgd) pgd_present(pgd) @@ -55,6 +84,24 @@ #define stage2_pud_offset(pgd, address) pud_offset(pgd, address) #define stage2_pud_free(pud) pud_free(NULL, pud) +#define stage2_pud_table_empty(pudp) kvm_page_empty(pudp) + +static inline phys_addr_t stage2_pud_addr_end(phys_addr_t addr, phys_addr_t end) +{ + phys_addr_t boundary = (addr + S2_PUD_SIZE) & S2_PUD_MASK; + + return (boundary - 1 < end - 1) ? boundary : end; +} + +#endif /* STAGE2_PGTABLE_LEVELS > 3 */ + + +#if STAGE2_PGTABLE_LEVELS > 2 + +#define S2_PMD_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(2) +#define S2_PMD_SIZE (_AC(1, UL) << S2_PMD_SHIFT) +#define S2_PMD_MASK (~(S2_PMD_SIZE - 1)) + #define stage2_pud_none(pud) pud_none(pud) #define stage2_pud_clear(pud) pud_clear(pud) #define stage2_pud_present(pud) pud_present(pud) @@ -63,24 +110,33 @@ #define stage2_pmd_free(pmd) pmd_free(NULL, pmd) #define stage2_pud_huge(pud) pud_huge(pud) +#define stage2_pmd_table_empty(pmdp) kvm_page_empty(pmdp) -#define stage2_pgd_addr_end(address, end) pgd_addr_end(address, end) -#define stage2_pud_addr_end(address, end) pud_addr_end(address, end) -#define stage2_pmd_addr_end(address, end) pmd_addr_end(address, end) +static inline phys_addr_t stage2_pmd_addr_end(phys_addr_t addr, phys_addr_t end) +{ + phys_addr_t boundary = (addr + S2_PMD_SIZE) & S2_PMD_MASK; + + return (boundary - 1 < end - 1) ? boundary : end; +} + +#endif /* STAGE2_PGTABLE_LEVELS > 2 */ #define stage2_pte_table_empty(ptep) kvm_page_empty(ptep) -#ifdef __PGTABLE_PMD_FOLDED -#define stage2_pmd_table_empty(pmdp) (0) -#else -#define stage2_pmd_table_empty(pmdp) ((KVM_PREALLOC_LEVEL < 2) && kvm_page_empty(pmdp)) + +#if STAGE2_PGTABLE_LEVELS == 2 +#include +#elif STAGE2_PGTABLE_LEVELS == 3 +#include #endif -#ifdef __PGTABLE_PUD_FOLDED -#define stage2_pud_table_empty(pudp) (0) -#else -#define stage2_pud_table_empty(pudp) ((KVM_PREALLOC_LEVEL < 1) && kvm_page_empty(pudp)) -#endif -#define stage2_pgd_index(addr) (((addr) >> PGDIR_SHIFT) & (PTRS_PER_S2_PGD - 1)) +#define stage2_pgd_index(addr) (((addr) >> S2_PGDIR_SHIFT) & (PTRS_PER_S2_PGD - 1)) + +static inline phys_addr_t stage2_pgd_addr_end(phys_addr_t addr, phys_addr_t end) +{ + phys_addr_t boundary = (addr + S2_PGDIR_SIZE) & S2_PGDIR_MASK; + + return (boundary - 1 < end - 1) ? boundary : end; +} #endif /* __ARM64_S2_PGTABLE_H_ */ From 9163ee23e72333e4712f7edd1a49aef06eae6304 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Tue, 22 Mar 2016 17:01:21 +0000 Subject: [PATCH 16/29] kvm-arm: Cleanup stage2 pgd handling Now that we don't have any fake page table levels for arm64, cleanup the common code to get rid of the dead code. Cc: Marc Zyngier Acked-by: Christoffer Dall Signed-off-by: Suzuki K Poulose --- arch/arm/include/asm/kvm_mmu.h | 19 ---------------- arch/arm/kvm/arm.c | 2 +- arch/arm/kvm/mmu.c | 37 ++++++-------------------------- arch/arm64/include/asm/kvm_mmu.h | 18 ---------------- 4 files changed, 7 insertions(+), 69 deletions(-) diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index a7736d53a408..6344ea0ad624 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -150,25 +150,6 @@ static inline bool kvm_page_empty(void *ptr) #define hyp_pmd_table_empty(pmdp) kvm_page_empty(pmdp) #define hyp_pud_table_empty(pudp) false -static inline void *kvm_get_hwpgd(struct kvm *kvm) -{ - return kvm->arch.pgd; -} - -static inline unsigned int kvm_get_hwpgd_size(void) -{ - return PTRS_PER_S2_PGD * sizeof(pgd_t); -} - -static inline pgd_t *kvm_setup_fake_pgd(pgd_t *hwpgd) -{ - return hwpgd; -} - -static inline void kvm_free_fake_pgd(pgd_t *pgd) -{ -} - struct kvm; #define kvm_flush_dcache_to_poc(a,l) __cpuc_flush_dcache_area((a), (l)) diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index dded1b763c16..be4b6394a062 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -448,7 +448,7 @@ static void update_vttbr(struct kvm *kvm) kvm_next_vmid &= (1 << kvm_vmid_bits) - 1; /* update vttbr to be used with the new vmid */ - pgd_phys = virt_to_phys(kvm_get_hwpgd(kvm)); + pgd_phys = virt_to_phys(kvm->arch.pgd); BUG_ON(pgd_phys & ~VTTBR_BADDR_MASK); vmid = ((u64)(kvm->arch.vmid) << VTTBR_VMID_SHIFT) & VTTBR_VMID_MASK(kvm_vmid_bits); kvm->arch.vttbr = pgd_phys | vmid; diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index d3fa96e0f709..42eefab3e8e1 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -43,6 +43,7 @@ static unsigned long hyp_idmap_start; static unsigned long hyp_idmap_end; static phys_addr_t hyp_idmap_vector; +#define S2_PGD_SIZE (PTRS_PER_S2_PGD * sizeof(pgd_t)) #define hyp_pgd_order get_order(PTRS_PER_PGD * sizeof(pgd_t)) #define KVM_S2PTE_FLAG_IS_IOMAP (1UL << 0) @@ -736,20 +737,6 @@ int create_hyp_io_mappings(void *from, void *to, phys_addr_t phys_addr) __phys_to_pfn(phys_addr), PAGE_HYP_DEVICE); } -/* Free the HW pgd, one page at a time */ -static void kvm_free_hwpgd(void *hwpgd) -{ - free_pages_exact(hwpgd, kvm_get_hwpgd_size()); -} - -/* Allocate the HW PGD, making sure that each page gets its own refcount */ -static void *kvm_alloc_hwpgd(void) -{ - unsigned int size = kvm_get_hwpgd_size(); - - return alloc_pages_exact(size, GFP_KERNEL | __GFP_ZERO); -} - /** * kvm_alloc_stage2_pgd - allocate level-1 table for stage-2 translation. * @kvm: The KVM struct pointer for the VM. @@ -764,29 +751,17 @@ static void *kvm_alloc_hwpgd(void) int kvm_alloc_stage2_pgd(struct kvm *kvm) { pgd_t *pgd; - void *hwpgd; if (kvm->arch.pgd != NULL) { kvm_err("kvm_arch already initialized?\n"); return -EINVAL; } - hwpgd = kvm_alloc_hwpgd(); - if (!hwpgd) + /* Allocate the HW PGD, making sure that each page gets its own refcount */ + pgd = alloc_pages_exact(S2_PGD_SIZE, GFP_KERNEL | __GFP_ZERO); + if (!pgd) return -ENOMEM; - /* - * When the kernel uses more levels of page tables than the - * guest, we allocate a fake PGD and pre-populate it to point - * to the next-level page table, which will be the real - * initial page table pointed to by the VTTBR. - */ - pgd = kvm_setup_fake_pgd(hwpgd); - if (IS_ERR(pgd)) { - kvm_free_hwpgd(hwpgd); - return PTR_ERR(pgd); - } - kvm_clean_pgd(pgd); kvm->arch.pgd = pgd; return 0; @@ -874,8 +849,8 @@ void kvm_free_stage2_pgd(struct kvm *kvm) return; unmap_stage2_range(kvm, 0, KVM_PHYS_SIZE); - kvm_free_hwpgd(kvm_get_hwpgd(kvm)); - kvm_free_fake_pgd(kvm->arch.pgd); + /* Free the HW pgd, one page at a time */ + free_pages_exact(kvm->arch.pgd, S2_PGD_SIZE); kvm->arch.pgd = NULL; } diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index e3fee0acd1a2..249c4fc9c5f6 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -141,24 +141,6 @@ static inline bool kvm_s2pmd_readonly(pmd_t *pmd) return (pmd_val(*pmd) & PMD_S2_RDWR) == PMD_S2_RDONLY; } -static inline void *kvm_get_hwpgd(struct kvm *kvm) -{ - return kvm->arch.pgd; -} - -static inline unsigned int kvm_get_hwpgd_size(void) -{ - return PTRS_PER_S2_PGD * sizeof(pgd_t); -} - -static inline pgd_t *kvm_setup_fake_pgd(pgd_t *hwpgd) -{ - return hwpgd; -} - -static inline void kvm_free_fake_pgd(pgd_t *pgd) -{ -} static inline bool kvm_page_empty(void *ptr) { struct page *ptr_page = virt_to_page(ptr); From 02e0b7600f8350078f01328095c20dd715700921 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Thu, 17 Mar 2016 14:29:24 +0000 Subject: [PATCH 17/29] arm64: kvm: Add support for 16K pages Now that we can handle stage-2 page tables independent of the host page table levels, wire up the 16K page support. Cc: Marc Zyngier Reviewed-by: Christoffer Dall Signed-off-by: Suzuki K Poulose --- arch/arm64/include/asm/kvm_arm.h | 13 +++++++++++-- arch/arm64/kvm/Kconfig | 1 - 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index 1281d98392a0..c6cbb361bbcf 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -114,6 +114,7 @@ #define VTCR_EL2_PS_MASK TCR_EL2_PS_MASK #define VTCR_EL2_TG0_MASK TCR_TG0_MASK #define VTCR_EL2_TG0_4K TCR_TG0_4K +#define VTCR_EL2_TG0_16K TCR_TG0_16K #define VTCR_EL2_TG0_64K TCR_TG0_64K #define VTCR_EL2_SH0_MASK TCR_SH0_MASK #define VTCR_EL2_SH0_INNER TCR_SH0_INNER @@ -139,7 +140,7 @@ * (see hyp-init.S). * * Note that when using 4K pages, we concatenate two first level page tables - * together. + * together. With 16K pages, we concatenate 16 first level page tables. * * The magic numbers used for VTTBR_X in this patch can be found in Tables * D4-23 and D4-25 in ARM DDI 0487A.b. @@ -157,7 +158,15 @@ */ #define VTCR_EL2_TGRAN_FLAGS (VTCR_EL2_TG0_64K | VTCR_EL2_SL0_LVL1) #define VTTBR_X_TGRAN_MAGIC 38 -#else +#elif defined(CONFIG_ARM64_16K_PAGES) +/* + * Stage2 translation configuration: + * 16kB pages (TG0 = 2) + * 2 level page tables (SL = 1) + */ +#define VTCR_EL2_TGRAN_FLAGS (VTCR_EL2_TG0_16K | VTCR_EL2_SL0_LVL1) +#define VTTBR_X_TGRAN_MAGIC 42 +#else /* 4K */ /* * Stage2 translation configuration: * 4kB pages (TG0 = 0) diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index de7450df7629..aa2e34e99582 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -22,7 +22,6 @@ config KVM_ARM_VGIC_V3 config KVM bool "Kernel-based Virtual Machine (KVM) support" depends on OF - depends on !ARM64_16K_PAGES select MMU_NOTIFIER select PREEMPT_NOTIFIERS select ANON_INODES From d4b9e0790aa764c0b01e18d4e8d33e93ba36d51f Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 28 Apr 2016 16:16:31 +0100 Subject: [PATCH 18/29] arm/arm64: KVM: Enforce Break-Before-Make on Stage-2 page tables The ARM architecture mandates that when changing a page table entry from a valid entry to another valid entry, an invalid entry is first written, TLB invalidated, and only then the new entry being written. The current code doesn't respect this, directly writing the new entry and only then invalidating TLBs. Let's fix it up. Cc: Reported-by: Christoffer Dall Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/kvm/mmu.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 42eefab3e8e1..74b5d199f6b7 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -910,11 +910,14 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache VM_BUG_ON(pmd_present(*pmd) && pmd_pfn(*pmd) != pmd_pfn(*new_pmd)); old_pmd = *pmd; - kvm_set_pmd(pmd, *new_pmd); - if (pmd_present(old_pmd)) + if (pmd_present(old_pmd)) { + pmd_clear(pmd); kvm_tlb_flush_vmid_ipa(kvm, addr); - else + } else { get_page(virt_to_page(pmd)); + } + + kvm_set_pmd(pmd, *new_pmd); return 0; } @@ -963,12 +966,14 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, /* Create 2nd stage page table mapping - Level 3 */ old_pte = *pte; - kvm_set_pte(pte, *new_pte); - if (pte_present(old_pte)) + if (pte_present(old_pte)) { + kvm_set_pte(pte, __pte(0)); kvm_tlb_flush_vmid_ipa(kvm, addr); - else + } else { get_page(virt_to_page(pte)); + } + kvm_set_pte(pte, *new_pte); return 0; } From b4d6ce9776e0fb773418efe8bc81d8c5ccca3493 Mon Sep 17 00:00:00 2001 From: Julien Grall Date: Mon, 11 Apr 2016 16:32:51 +0100 Subject: [PATCH 19/29] clocksource: arm_arch_timer: Gather KVM specific information in a structure Introduce a structure which are filled up by the arch timer driver and used by the virtual timer in KVM. The first member of this structure will be the timecounter. More members will be added later. A stub for the new helper isn't introduced because KVM requires the arch timer for both ARM64 and ARM32. The function arch_timer_get_timecounter is kept for the time being and will be dropped in a subsequent patch. Signed-off-by: Julien Grall Acked-by: Christoffer Dall Signed-off-by: Christoffer Dall --- drivers/clocksource/arm_arch_timer.c | 12 +++++++++--- include/clocksource/arm_arch_timer.h | 5 +++++ 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/drivers/clocksource/arm_arch_timer.c b/drivers/clocksource/arm_arch_timer.c index 5152b3898155..62bdfe7067e3 100644 --- a/drivers/clocksource/arm_arch_timer.c +++ b/drivers/clocksource/arm_arch_timer.c @@ -468,11 +468,16 @@ static struct cyclecounter cyclecounter = { .mask = CLOCKSOURCE_MASK(56), }; -static struct timecounter timecounter; +static struct arch_timer_kvm_info arch_timer_kvm_info; + +struct arch_timer_kvm_info *arch_timer_get_kvm_info(void) +{ + return &arch_timer_kvm_info; +} struct timecounter *arch_timer_get_timecounter(void) { - return &timecounter; + return &arch_timer_kvm_info.timecounter; } static void __init arch_counter_register(unsigned type) @@ -500,7 +505,8 @@ static void __init arch_counter_register(unsigned type) clocksource_register_hz(&clocksource_counter, arch_timer_rate); cyclecounter.mult = clocksource_counter.mult; cyclecounter.shift = clocksource_counter.shift; - timecounter_init(&timecounter, &cyclecounter, start_count); + timecounter_init(&arch_timer_kvm_info.timecounter, + &cyclecounter, start_count); /* 56 bits minimum, so we assume worst case rollover */ sched_clock_register(arch_timer_read_counter, 56, arch_timer_rate); diff --git a/include/clocksource/arm_arch_timer.h b/include/clocksource/arm_arch_timer.h index 25d0914481a2..9101ed6b5550 100644 --- a/include/clocksource/arm_arch_timer.h +++ b/include/clocksource/arm_arch_timer.h @@ -49,11 +49,16 @@ enum arch_timer_reg { #define ARCH_TIMER_EVT_STREAM_FREQ 10000 /* 100us */ +struct arch_timer_kvm_info { + struct timecounter timecounter; +}; + #ifdef CONFIG_ARM_ARCH_TIMER extern u32 arch_timer_get_rate(void); extern u64 (*arch_timer_read_counter)(void); extern struct timecounter *arch_timer_get_timecounter(void); +extern struct arch_timer_kvm_info *arch_timer_get_kvm_info(void); #else From d9b5e41591ca6bc6678e287b5ffe7fac2e07436e Mon Sep 17 00:00:00 2001 From: Julien Grall Date: Mon, 11 Apr 2016 16:32:52 +0100 Subject: [PATCH 20/29] clocksource: arm_arch_timer: Extend arch_timer_kvm_info to get the virtual IRQ Currently, the firmware table is parsed by the virtual timer code in order to retrieve the virtual timer interrupt. However, this is already done by the arch timer driver. To avoid code duplication, extend arch_timer_kvm_info to get the virtual IRQ. Note that the KVM code will be modified in a subsequent patch. Signed-off-by: Julien Grall Acked-by: Christoffer Dall Signed-off-by: Christoffer Dall --- drivers/clocksource/arm_arch_timer.c | 2 ++ include/clocksource/arm_arch_timer.h | 1 + 2 files changed, 3 insertions(+) diff --git a/drivers/clocksource/arm_arch_timer.c b/drivers/clocksource/arm_arch_timer.c index 62bdfe7067e3..bb58224eea36 100644 --- a/drivers/clocksource/arm_arch_timer.c +++ b/drivers/clocksource/arm_arch_timer.c @@ -750,6 +750,8 @@ static void __init arch_timer_init(void) arch_timer_register(); arch_timer_common_init(); + + arch_timer_kvm_info.virtual_irq = arch_timer_ppi[VIRT_PPI]; } static void __init arch_timer_of_init(struct device_node *np) diff --git a/include/clocksource/arm_arch_timer.h b/include/clocksource/arm_arch_timer.h index 9101ed6b5550..9dd996a67e9e 100644 --- a/include/clocksource/arm_arch_timer.h +++ b/include/clocksource/arm_arch_timer.h @@ -51,6 +51,7 @@ enum arch_timer_reg { struct arch_timer_kvm_info { struct timecounter timecounter; + int virtual_irq; }; #ifdef CONFIG_ARM_ARCH_TIMER From bafa9193d00c7bfff4c4aea0d48cd2b55a6378c2 Mon Sep 17 00:00:00 2001 From: Julien Grall Date: Mon, 11 Apr 2016 16:32:53 +0100 Subject: [PATCH 21/29] irqchip/gic-v2: Gather ACPI specific data in a single structure The ACPI code requires to use global variables in order to collect information from the tables. For now, a single global variable is used, but more will be added in a subsequent patch. To make clear they are ACPI specific, gather all the information in a single structure. Signed-off-by: Julien Grall Acked-by: Christofer Dall Acked-by: Hanjun Guo Signed-off-by: Christoffer Dall --- drivers/irqchip/irq-gic.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c index 282344b95ec2..7a73786596cd 100644 --- a/drivers/irqchip/irq-gic.c +++ b/drivers/irqchip/irq-gic.c @@ -1245,7 +1245,10 @@ IRQCHIP_DECLARE(pl390, "arm,pl390", gic_of_init); #endif #ifdef CONFIG_ACPI -static phys_addr_t cpu_phy_base __initdata; +static struct +{ + phys_addr_t cpu_phys_base; +} acpi_data __initdata; static int __init gic_acpi_parse_madt_cpu(struct acpi_subtable_header *header, @@ -1265,10 +1268,10 @@ gic_acpi_parse_madt_cpu(struct acpi_subtable_header *header, * All CPU interface addresses have to be the same. */ gic_cpu_base = processor->base_address; - if (cpu_base_assigned && gic_cpu_base != cpu_phy_base) + if (cpu_base_assigned && gic_cpu_base != acpi_data.cpu_phys_base) return -EINVAL; - cpu_phy_base = gic_cpu_base; + acpi_data.cpu_phys_base = gic_cpu_base; cpu_base_assigned = 1; return 0; } @@ -1316,7 +1319,7 @@ static int __init gic_v2_acpi_init(struct acpi_subtable_header *header, return -EINVAL; } - cpu_base = ioremap(cpu_phy_base, ACPI_GIC_CPU_IF_MEM_SIZE); + cpu_base = ioremap(acpi_data.cpu_phys_base, ACPI_GIC_CPU_IF_MEM_SIZE); if (!cpu_base) { pr_err("Unable to map GICC registers\n"); return -ENOMEM; From 502d6df11ae394301470703fa6e485a0dc133401 Mon Sep 17 00:00:00 2001 From: Julien Grall Date: Mon, 11 Apr 2016 16:32:54 +0100 Subject: [PATCH 22/29] irqchip/gic-v2: Parse and export virtual GIC information For now, the firmware tables are parsed 2 times: once in the GIC drivers, the other timer when initializing the vGIC. It means code duplication and make more tedious to add the support for another firmware table (like ACPI). Introduce a new structure and set of helpers to get/set the virtual GIC information. Also fill up the structure for GICv2. Signed-off-by: Julien Grall Signed-off-by: Christoffer Dall --- drivers/irqchip/irq-gic-common.c | 13 +++++ drivers/irqchip/irq-gic-common.h | 3 + drivers/irqchip/irq-gic.c | 76 +++++++++++++++++++++++++- include/linux/irqchip/arm-gic-common.h | 33 +++++++++++ 4 files changed, 124 insertions(+), 1 deletion(-) create mode 100644 include/linux/irqchip/arm-gic-common.h diff --git a/drivers/irqchip/irq-gic-common.c b/drivers/irqchip/irq-gic-common.c index f174ce0ca361..2e9443be2b14 100644 --- a/drivers/irqchip/irq-gic-common.c +++ b/drivers/irqchip/irq-gic-common.c @@ -21,6 +21,19 @@ #include "irq-gic-common.h" +static const struct gic_kvm_info *gic_kvm_info; + +const struct gic_kvm_info *gic_get_kvm_info(void) +{ + return gic_kvm_info; +} + +void gic_set_kvm_info(const struct gic_kvm_info *info) +{ + BUG_ON(gic_kvm_info != NULL); + gic_kvm_info = info; +} + void gic_enable_quirks(u32 iidr, const struct gic_quirk *quirks, void *data) { diff --git a/drivers/irqchip/irq-gic-common.h b/drivers/irqchip/irq-gic-common.h index fff697db8e22..205e5fddf6da 100644 --- a/drivers/irqchip/irq-gic-common.h +++ b/drivers/irqchip/irq-gic-common.h @@ -19,6 +19,7 @@ #include #include +#include struct gic_quirk { const char *desc; @@ -35,4 +36,6 @@ void gic_cpu_config(void __iomem *base, void (*sync_access)(void)); void gic_enable_quirks(u32 iidr, const struct gic_quirk *quirks, void *data); +void gic_set_kvm_info(const struct gic_kvm_info *info); + #endif /* _IRQ_GIC_COMMON_H */ diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c index 7a73786596cd..3f1d9fd3a462 100644 --- a/drivers/irqchip/irq-gic.c +++ b/drivers/irqchip/irq-gic.c @@ -102,6 +102,8 @@ static struct static_key supports_deactivate = STATIC_KEY_INIT_TRUE; static struct gic_chip_data gic_data[CONFIG_ARM_GIC_MAX_NR] __read_mostly; +static struct gic_kvm_info gic_v2_kvm_info; + #ifdef CONFIG_GIC_NON_BANKED static void __iomem *gic_get_percpu_base(union gic_base *base) { @@ -1189,6 +1191,29 @@ static bool gic_check_eoimode(struct device_node *node, void __iomem **base) return true; } +static void __init gic_of_setup_kvm_info(struct device_node *node) +{ + int ret; + struct resource *vctrl_res = &gic_v2_kvm_info.vctrl; + struct resource *vcpu_res = &gic_v2_kvm_info.vcpu; + + gic_v2_kvm_info.type = GIC_V2; + + gic_v2_kvm_info.maint_irq = irq_of_parse_and_map(node, 0); + if (!gic_v2_kvm_info.maint_irq) + return; + + ret = of_address_to_resource(node, 2, vctrl_res); + if (ret) + return; + + ret = of_address_to_resource(node, 3, vcpu_res); + if (ret) + return; + + gic_set_kvm_info(&gic_v2_kvm_info); +} + int __init gic_of_init(struct device_node *node, struct device_node *parent) { @@ -1218,8 +1243,10 @@ gic_of_init(struct device_node *node, struct device_node *parent) __gic_init_bases(gic_cnt, -1, dist_base, cpu_base, percpu_offset, &node->fwnode); - if (!gic_cnt) + if (!gic_cnt) { gic_init_physaddr(node); + gic_of_setup_kvm_info(node); + } if (parent) { irq = irq_of_parse_and_map(node, 0); @@ -1248,6 +1275,10 @@ IRQCHIP_DECLARE(pl390, "arm,pl390", gic_of_init); static struct { phys_addr_t cpu_phys_base; + u32 maint_irq; + int maint_irq_mode; + phys_addr_t vctrl_base; + phys_addr_t vcpu_base; } acpi_data __initdata; static int __init @@ -1272,6 +1303,12 @@ gic_acpi_parse_madt_cpu(struct acpi_subtable_header *header, return -EINVAL; acpi_data.cpu_phys_base = gic_cpu_base; + acpi_data.maint_irq = processor->vgic_interrupt; + acpi_data.maint_irq_mode = (processor->flags & ACPI_MADT_VGIC_IRQ_MODE) ? + ACPI_EDGE_SENSITIVE : ACPI_LEVEL_SENSITIVE; + acpi_data.vctrl_base = processor->gich_base_address; + acpi_data.vcpu_base = processor->gicv_base_address; + cpu_base_assigned = 1; return 0; } @@ -1302,6 +1339,41 @@ static bool __init gic_validate_dist(struct acpi_subtable_header *header, #define ACPI_GICV2_DIST_MEM_SIZE (SZ_4K) #define ACPI_GIC_CPU_IF_MEM_SIZE (SZ_8K) +#define ACPI_GICV2_VCTRL_MEM_SIZE (SZ_4K) +#define ACPI_GICV2_VCPU_MEM_SIZE (SZ_8K) + +static void __init gic_acpi_setup_kvm_info(void) +{ + int irq; + struct resource *vctrl_res = &gic_v2_kvm_info.vctrl; + struct resource *vcpu_res = &gic_v2_kvm_info.vcpu; + + gic_v2_kvm_info.type = GIC_V2; + + if (!acpi_data.vctrl_base) + return; + + vctrl_res->flags = IORESOURCE_MEM; + vctrl_res->start = acpi_data.vctrl_base; + vctrl_res->end = vctrl_res->start + ACPI_GICV2_VCTRL_MEM_SIZE - 1; + + if (!acpi_data.vcpu_base) + return; + + vcpu_res->flags = IORESOURCE_MEM; + vcpu_res->start = acpi_data.vcpu_base; + vcpu_res->end = vcpu_res->start + ACPI_GICV2_VCPU_MEM_SIZE - 1; + + irq = acpi_register_gsi(NULL, acpi_data.maint_irq, + acpi_data.maint_irq_mode, + ACPI_ACTIVE_HIGH); + if (irq <= 0) + return; + + gic_v2_kvm_info.maint_irq = irq; + + gic_set_kvm_info(&gic_v2_kvm_info); +} static int __init gic_v2_acpi_init(struct acpi_subtable_header *header, const unsigned long end) @@ -1359,6 +1431,8 @@ static int __init gic_v2_acpi_init(struct acpi_subtable_header *header, if (IS_ENABLED(CONFIG_ARM_GIC_V2M)) gicv2m_init(NULL, gic_data[0].domain); + gic_acpi_setup_kvm_info(); + return 0; } IRQCHIP_ACPI_DECLARE(gic_v2, ACPI_MADT_TYPE_GENERIC_DISTRIBUTOR, diff --git a/include/linux/irqchip/arm-gic-common.h b/include/linux/irqchip/arm-gic-common.h new file mode 100644 index 000000000000..ef34f6f35e91 --- /dev/null +++ b/include/linux/irqchip/arm-gic-common.h @@ -0,0 +1,33 @@ +/* + * include/linux/irqchip/arm-gic-common.h + * + * Copyright (C) 2016 ARM Limited, All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#ifndef __LINUX_IRQCHIP_ARM_GIC_COMMON_H +#define __LINUX_IRQCHIP_ARM_GIC_COMMON_H + +#include +#include + +enum gic_type { + GIC_V2, +}; + +struct gic_kvm_info { + /* GIC type */ + enum gic_type type; + /* Virtual CPU interface */ + struct resource vcpu; + /* Interrupt number */ + unsigned int maint_irq; + /* Virtual control interface */ + struct resource vctrl; +}; + +const struct gic_kvm_info *gic_get_kvm_info(void); + +#endif /* __LINUX_IRQCHIP_ARM_GIC_COMMON_H */ From 68628bb87fe559969b92c8d5c5fd78d8dea5676b Mon Sep 17 00:00:00 2001 From: Julien Grall Date: Mon, 11 Apr 2016 16:32:55 +0100 Subject: [PATCH 23/29] irqchip/gic-v3: Prefix all pr_* messages by "GICv3: " Currently, most of the pr_* messages in the GICv3 driver don't have a prefix. Add one to make clear where the messages come from. Signed-off-by: Julien Grall Signed-off-by: Christoffer Dall --- drivers/irqchip/irq-gic-v3.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c index 5b7d3c2129d8..6dc6f039e7eb 100644 --- a/drivers/irqchip/irq-gic-v3.c +++ b/drivers/irqchip/irq-gic-v3.c @@ -15,6 +15,8 @@ * along with this program. If not, see . */ +#define pr_fmt(fmt) "GICv3: " fmt + #include #include #include From 611f039f5ee4f18a2941574857390cf8765184d5 Mon Sep 17 00:00:00 2001 From: Julien Grall Date: Mon, 11 Apr 2016 16:32:56 +0100 Subject: [PATCH 24/29] irqchip/gic-v3: Gather all ACPI specific data in a single structure The ACPI code requires to use global variables in order to collect information from the tables. To make clear those variables are ACPI specific, gather all of them in a single structure. Furthermore, even if some of the variables are not marked with __initdata, they are all only used during the initialization. Therefore, the new variable, which hold the structure, can be marked with __initdata. Signed-off-by: Julien Grall Acked-by: Christoffer Dall Reviewed-by: Hanjun Guo Signed-off-by: Christoffer Dall --- drivers/irqchip/irq-gic-v3.c | 60 ++++++++++++++++++++---------------- 1 file changed, 33 insertions(+), 27 deletions(-) diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c index 6dc6f039e7eb..6a9f5ff161a4 100644 --- a/drivers/irqchip/irq-gic-v3.c +++ b/drivers/irqchip/irq-gic-v3.c @@ -970,19 +970,22 @@ out_unmap_dist: IRQCHIP_DECLARE(gic_v3, "arm,gic-v3", gic_of_init); #ifdef CONFIG_ACPI -static void __iomem *dist_base; -static struct redist_region *redist_regs __initdata; -static u32 nr_redist_regions __initdata; -static bool single_redist; +static struct +{ + void __iomem *dist_base; + struct redist_region *redist_regs; + u32 nr_redist_regions; + bool single_redist; +} acpi_data __initdata; static void __init gic_acpi_register_redist(phys_addr_t phys_base, void __iomem *redist_base) { static int count = 0; - redist_regs[count].phys_base = phys_base; - redist_regs[count].redist_base = redist_base; - redist_regs[count].single_redist = single_redist; + acpi_data.redist_regs[count].phys_base = phys_base; + acpi_data.redist_regs[count].redist_base = redist_base; + acpi_data.redist_regs[count].single_redist = acpi_data.single_redist; count++; } @@ -1010,7 +1013,7 @@ gic_acpi_parse_madt_gicc(struct acpi_subtable_header *header, { struct acpi_madt_generic_interrupt *gicc = (struct acpi_madt_generic_interrupt *)header; - u32 reg = readl_relaxed(dist_base + GICD_PIDR2) & GIC_PIDR2_ARCH_MASK; + u32 reg = readl_relaxed(acpi_data.dist_base + GICD_PIDR2) & GIC_PIDR2_ARCH_MASK; u32 size = reg == GIC_PIDR2_ARCH_GICv4 ? SZ_64K * 4 : SZ_64K * 2; void __iomem *redist_base; @@ -1027,7 +1030,7 @@ static int __init gic_acpi_collect_gicr_base(void) acpi_tbl_entry_handler redist_parser; enum acpi_madt_type type; - if (single_redist) { + if (acpi_data.single_redist) { type = ACPI_MADT_TYPE_GENERIC_INTERRUPT; redist_parser = gic_acpi_parse_madt_gicc; } else { @@ -1078,14 +1081,14 @@ static int __init gic_acpi_count_gicr_regions(void) count = acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_REDISTRIBUTOR, gic_acpi_match_gicr, 0); if (count > 0) { - single_redist = false; + acpi_data.single_redist = false; return count; } count = acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_INTERRUPT, gic_acpi_match_gicc, 0); if (count > 0) - single_redist = true; + acpi_data.single_redist = true; return count; } @@ -1105,7 +1108,7 @@ static bool __init acpi_validate_gic_table(struct acpi_subtable_header *header, if (count <= 0) return false; - nr_redist_regions = count; + acpi_data.nr_redist_regions = count; return true; } @@ -1116,25 +1119,28 @@ gic_acpi_init(struct acpi_subtable_header *header, const unsigned long end) { struct acpi_madt_generic_distributor *dist; struct fwnode_handle *domain_handle; + size_t size; int i, err; /* Get distributor base address */ dist = (struct acpi_madt_generic_distributor *)header; - dist_base = ioremap(dist->base_address, ACPI_GICV3_DIST_MEM_SIZE); - if (!dist_base) { + acpi_data.dist_base = ioremap(dist->base_address, + ACPI_GICV3_DIST_MEM_SIZE); + if (!acpi_data.dist_base) { pr_err("Unable to map GICD registers\n"); return -ENOMEM; } - err = gic_validate_dist_version(dist_base); + err = gic_validate_dist_version(acpi_data.dist_base); if (err) { - pr_err("No distributor detected at @%p, giving up", dist_base); + pr_err("No distributor detected at @%p, giving up", + acpi_data.dist_base); goto out_dist_unmap; } - redist_regs = kzalloc(sizeof(*redist_regs) * nr_redist_regions, - GFP_KERNEL); - if (!redist_regs) { + size = sizeof(*acpi_data.redist_regs) * acpi_data.nr_redist_regions; + acpi_data.redist_regs = kzalloc(size, GFP_KERNEL); + if (!acpi_data.redist_regs) { err = -ENOMEM; goto out_dist_unmap; } @@ -1143,14 +1149,14 @@ gic_acpi_init(struct acpi_subtable_header *header, const unsigned long end) if (err) goto out_redist_unmap; - domain_handle = irq_domain_alloc_fwnode(dist_base); + domain_handle = irq_domain_alloc_fwnode(acpi_data.dist_base); if (!domain_handle) { err = -ENOMEM; goto out_redist_unmap; } - err = gic_init_bases(dist_base, redist_regs, nr_redist_regions, 0, - domain_handle); + err = gic_init_bases(acpi_data.dist_base, acpi_data.redist_regs, + acpi_data.nr_redist_regions, 0, domain_handle); if (err) goto out_fwhandle_free; @@ -1160,12 +1166,12 @@ gic_acpi_init(struct acpi_subtable_header *header, const unsigned long end) out_fwhandle_free: irq_domain_free_fwnode(domain_handle); out_redist_unmap: - for (i = 0; i < nr_redist_regions; i++) - if (redist_regs[i].redist_base) - iounmap(redist_regs[i].redist_base); - kfree(redist_regs); + for (i = 0; i < acpi_data.nr_redist_regions; i++) + if (acpi_data.redist_regs[i].redist_base) + iounmap(acpi_data.redist_regs[i].redist_base); + kfree(acpi_data.redist_regs); out_dist_unmap: - iounmap(dist_base); + iounmap(acpi_data.dist_base); return err; } IRQCHIP_ACPI_DECLARE(gic_v3, ACPI_MADT_TYPE_GENERIC_DISTRIBUTOR, From 1839e576968f34b9a31da9f0033f8de12a1c9de6 Mon Sep 17 00:00:00 2001 From: Julien Grall Date: Mon, 11 Apr 2016 16:32:57 +0100 Subject: [PATCH 25/29] irqchip/gic-v3: Parse and export virtual GIC information Fill up the recently introduced gic_kvm_info with the hardware information used for virtualization. Signed-off-by: Julien Grall Cc: Thomas Gleixner Cc: Jason Cooper Cc: Marc Zyngier Signed-off-by: Christoffer Dall --- drivers/irqchip/irq-gic-v3.c | 114 ++++++++++++++++++++++++- include/linux/irqchip/arm-gic-common.h | 1 + 2 files changed, 114 insertions(+), 1 deletion(-) diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c index 6a9f5ff161a4..05a856073714 100644 --- a/drivers/irqchip/irq-gic-v3.c +++ b/drivers/irqchip/irq-gic-v3.c @@ -30,6 +30,7 @@ #include #include +#include #include #include @@ -58,6 +59,8 @@ struct gic_chip_data { static struct gic_chip_data gic_data __read_mostly; static struct static_key supports_deactivate = STATIC_KEY_INIT_TRUE; +static struct gic_kvm_info gic_v3_kvm_info; + #define gic_data_rdist() (this_cpu_ptr(gic_data.rdists.rdist)) #define gic_data_rdist_rd_base() (gic_data_rdist()->rd_base) #define gic_data_rdist_sgi_base() (gic_data_rdist_rd_base() + SZ_64K) @@ -903,6 +906,30 @@ static int __init gic_validate_dist_version(void __iomem *dist_base) return 0; } +static void __init gic_of_setup_kvm_info(struct device_node *node) +{ + int ret; + struct resource r; + u32 gicv_idx; + + gic_v3_kvm_info.type = GIC_V3; + + gic_v3_kvm_info.maint_irq = irq_of_parse_and_map(node, 0); + if (!gic_v3_kvm_info.maint_irq) + return; + + if (of_property_read_u32(node, "#redistributor-regions", + &gicv_idx)) + gicv_idx = 1; + + gicv_idx += 3; /* Also skip GICD, GICC, GICH */ + ret = of_address_to_resource(node, gicv_idx, &r); + if (!ret) + gic_v3_kvm_info.vcpu = r; + + gic_set_kvm_info(&gic_v3_kvm_info); +} + static int __init gic_of_init(struct device_node *node, struct device_node *parent) { void __iomem *dist_base; @@ -954,8 +981,10 @@ static int __init gic_of_init(struct device_node *node, struct device_node *pare err = gic_init_bases(dist_base, rdist_regs, nr_redist_regions, redist_stride, &node->fwnode); - if (!err) + if (!err) { + gic_of_setup_kvm_info(node); return 0; + } out_unmap_rdist: for (i = 0; i < nr_redist_regions; i++) @@ -976,6 +1005,9 @@ static struct struct redist_region *redist_regs; u32 nr_redist_regions; bool single_redist; + u32 maint_irq; + int maint_irq_mode; + phys_addr_t vcpu_base; } acpi_data __initdata; static void __init @@ -1112,7 +1144,85 @@ static bool __init acpi_validate_gic_table(struct acpi_subtable_header *header, return true; } +static int __init gic_acpi_parse_virt_madt_gicc(struct acpi_subtable_header *header, + const unsigned long end) +{ + struct acpi_madt_generic_interrupt *gicc = + (struct acpi_madt_generic_interrupt *)header; + int maint_irq_mode; + static int first_madt = true; + + /* Skip unusable CPUs */ + if (!(gicc->flags & ACPI_MADT_ENABLED)) + return 0; + + maint_irq_mode = (gicc->flags & ACPI_MADT_VGIC_IRQ_MODE) ? + ACPI_EDGE_SENSITIVE : ACPI_LEVEL_SENSITIVE; + + if (first_madt) { + first_madt = false; + + acpi_data.maint_irq = gicc->vgic_interrupt; + acpi_data.maint_irq_mode = maint_irq_mode; + acpi_data.vcpu_base = gicc->gicv_base_address; + + return 0; + } + + /* + * The maintenance interrupt and GICV should be the same for every CPU + */ + if ((acpi_data.maint_irq != gicc->vgic_interrupt) || + (acpi_data.maint_irq_mode != maint_irq_mode) || + (acpi_data.vcpu_base != gicc->gicv_base_address)) + return -EINVAL; + + return 0; +} + +static bool __init gic_acpi_collect_virt_info(void) +{ + int count; + + count = acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_INTERRUPT, + gic_acpi_parse_virt_madt_gicc, 0); + + return (count > 0); +} + #define ACPI_GICV3_DIST_MEM_SIZE (SZ_64K) +#define ACPI_GICV2_VCTRL_MEM_SIZE (SZ_4K) +#define ACPI_GICV2_VCPU_MEM_SIZE (SZ_8K) + +static void __init gic_acpi_setup_kvm_info(void) +{ + int irq; + + if (!gic_acpi_collect_virt_info()) { + pr_warn("Unable to get hardware information used for virtualization\n"); + return; + } + + gic_v3_kvm_info.type = GIC_V3; + + irq = acpi_register_gsi(NULL, acpi_data.maint_irq, + acpi_data.maint_irq_mode, + ACPI_ACTIVE_HIGH); + if (irq <= 0) + return; + + gic_v3_kvm_info.maint_irq = irq; + + if (acpi_data.vcpu_base) { + struct resource *vcpu = &gic_v3_kvm_info.vcpu; + + vcpu->flags = IORESOURCE_MEM; + vcpu->start = acpi_data.vcpu_base; + vcpu->end = vcpu->start + ACPI_GICV2_VCPU_MEM_SIZE - 1; + } + + gic_set_kvm_info(&gic_v3_kvm_info); +} static int __init gic_acpi_init(struct acpi_subtable_header *header, const unsigned long end) @@ -1161,6 +1271,8 @@ gic_acpi_init(struct acpi_subtable_header *header, const unsigned long end) goto out_fwhandle_free; acpi_set_irq_model(ACPI_IRQ_MODEL_GIC, domain_handle); + gic_acpi_setup_kvm_info(); + return 0; out_fwhandle_free: diff --git a/include/linux/irqchip/arm-gic-common.h b/include/linux/irqchip/arm-gic-common.h index ef34f6f35e91..c647b0547bcd 100644 --- a/include/linux/irqchip/arm-gic-common.h +++ b/include/linux/irqchip/arm-gic-common.h @@ -15,6 +15,7 @@ enum gic_type { GIC_V2, + GIC_V3, }; struct gic_kvm_info { From 29c2d6ff4cf9af6bcbba3a76aae1d5cacd5da16b Mon Sep 17 00:00:00 2001 From: Julien Grall Date: Mon, 11 Apr 2016 16:32:58 +0100 Subject: [PATCH 26/29] KVM: arm/arm64: arch_timer: Rely on the arch timer to parse the firmware tables The firmware table is currently parsed by the virtual timer code in order to retrieve the virtual timer interrupt. However, this is already done by the arch timer driver. To avoid code duplication, use the newly function arch_timer_get_kvm_info() which return all the information required by the virtual timer code. Signed-off-by: Julien Grall Acked-by: Christoffer Dall Signed-off-by: Christoffer Dall --- virt/kvm/arm/arch_timer.c | 40 +++++++++++---------------------------- 1 file changed, 11 insertions(+), 29 deletions(-) diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index 9aaa35dd9144..409db3304471 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -17,7 +17,6 @@ */ #include -#include #include #include #include @@ -438,45 +437,29 @@ static struct notifier_block kvm_timer_cpu_nb = { .notifier_call = kvm_timer_cpu_notify, }; -static const struct of_device_id arch_timer_of_match[] = { - { .compatible = "arm,armv7-timer", }, - { .compatible = "arm,armv8-timer", }, - {}, -}; - int kvm_timer_hyp_init(void) { - struct device_node *np; - unsigned int ppi; + struct arch_timer_kvm_info *info; int err; - timecounter = arch_timer_get_timecounter(); - if (!timecounter) - return -ENODEV; + info = arch_timer_get_kvm_info(); + timecounter = &info->timecounter; - np = of_find_matching_node(NULL, arch_timer_of_match); - if (!np) { - kvm_err("kvm_arch_timer: can't find DT node\n"); + if (info->virtual_irq <= 0) { + kvm_err("kvm_arch_timer: invalid virtual timer IRQ: %d\n", + info->virtual_irq); return -ENODEV; } + host_vtimer_irq = info->virtual_irq; - ppi = irq_of_parse_and_map(np, 2); - if (!ppi) { - kvm_err("kvm_arch_timer: no virtual timer interrupt\n"); - err = -EINVAL; - goto out; - } - - err = request_percpu_irq(ppi, kvm_arch_timer_handler, + err = request_percpu_irq(host_vtimer_irq, kvm_arch_timer_handler, "kvm guest timer", kvm_get_running_vcpus()); if (err) { kvm_err("kvm_arch_timer: can't request interrupt %d (%d)\n", - ppi, err); + host_vtimer_irq, err); goto out; } - host_vtimer_irq = ppi; - err = __register_cpu_notifier(&kvm_timer_cpu_nb); if (err) { kvm_err("Cannot register timer CPU notifier\n"); @@ -489,14 +472,13 @@ int kvm_timer_hyp_init(void) goto out_free; } - kvm_info("%s IRQ%d\n", np->name, ppi); + kvm_info("virtual timer IRQ%d\n", host_vtimer_irq); on_each_cpu(kvm_timer_init_interrupt, NULL, 1); goto out; out_free: - free_percpu_irq(ppi, kvm_get_running_vcpus()); + free_percpu_irq(host_vtimer_irq, kvm_get_running_vcpus()); out: - of_node_put(np); return err; } From 503a62862e8fc08b2d4c0db06ba3adae3bbd61bc Mon Sep 17 00:00:00 2001 From: Julien Grall Date: Mon, 11 Apr 2016 16:32:59 +0100 Subject: [PATCH 27/29] KVM: arm/arm64: vgic: Rely on the GIC driver to parse the firmware tables Currently, the firmware tables are parsed 2 times: once in the GIC drivers, the other time when initializing the vGIC. It means code duplication and make more tedious to add the support for another firmware table (like ACPI). Use the recently introduced helper gic_get_kvm_info() to get information about the virtual GIC. With this change, the virtual GIC becomes agnostic to the firmware table and KVM will be able to initialize the vGIC on ACPI. Signed-off-by: Julien Grall Reviewed-by: Christoffer Dall Signed-off-by: Christoffer Dall --- include/kvm/arm_vgic.h | 7 +++-- virt/kvm/arm/vgic-v2.c | 67 ++++++++++++++++++------------------------ virt/kvm/arm/vgic-v3.c | 47 ++++++++++------------------- virt/kvm/arm/vgic.c | 52 +++++++++++++++++--------------- 4 files changed, 77 insertions(+), 96 deletions(-) diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 281caf847fad..be6037aa703d 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -25,6 +25,7 @@ #include #include #include +#include #define VGIC_NR_IRQS_LEGACY 256 #define VGIC_NR_SGIS 16 @@ -353,15 +354,15 @@ bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, struct irq_phys_map *map); #define vgic_initialized(k) (!!((k)->arch.vgic.nr_cpus)) #define vgic_ready(k) ((k)->arch.vgic.ready) -int vgic_v2_probe(struct device_node *vgic_node, +int vgic_v2_probe(const struct gic_kvm_info *gic_kvm_info, const struct vgic_ops **ops, const struct vgic_params **params); #ifdef CONFIG_KVM_ARM_VGIC_V3 -int vgic_v3_probe(struct device_node *vgic_node, +int vgic_v3_probe(const struct gic_kvm_info *gic_kvm_info, const struct vgic_ops **ops, const struct vgic_params **params); #else -static inline int vgic_v3_probe(struct device_node *vgic_node, +static inline int vgic_v3_probe(const struct gic_kvm_info *gic_kvm_info, const struct vgic_ops **ops, const struct vgic_params **params) { diff --git a/virt/kvm/arm/vgic-v2.c b/virt/kvm/arm/vgic-v2.c index 67ec334ce1d0..7e826c9b2b0a 100644 --- a/virt/kvm/arm/vgic-v2.c +++ b/virt/kvm/arm/vgic-v2.c @@ -20,9 +20,6 @@ #include #include #include -#include -#include -#include #include @@ -186,38 +183,39 @@ static void vgic_cpu_init_lrs(void *params) } /** - * vgic_v2_probe - probe for a GICv2 compatible interrupt controller in DT - * @node: pointer to the DT node - * @ops: address of a pointer to the GICv2 operations - * @params: address of a pointer to HW-specific parameters + * vgic_v2_probe - probe for a GICv2 compatible interrupt controller + * @gic_kvm_info: pointer to the GIC description + * @ops: address of a pointer to the GICv2 operations + * @params: address of a pointer to HW-specific parameters * * Returns 0 if a GICv2 has been found, with the low level operations * in *ops and the HW parameters in *params. Returns an error code * otherwise. */ -int vgic_v2_probe(struct device_node *vgic_node, - const struct vgic_ops **ops, - const struct vgic_params **params) +int vgic_v2_probe(const struct gic_kvm_info *gic_kvm_info, + const struct vgic_ops **ops, + const struct vgic_params **params) { int ret; - struct resource vctrl_res; - struct resource vcpu_res; struct vgic_params *vgic = &vgic_v2_params; + const struct resource *vctrl_res = &gic_kvm_info->vctrl; + const struct resource *vcpu_res = &gic_kvm_info->vcpu; - vgic->maint_irq = irq_of_parse_and_map(vgic_node, 0); - if (!vgic->maint_irq) { - kvm_err("error getting vgic maintenance irq from DT\n"); + if (!gic_kvm_info->maint_irq) { + kvm_err("error getting vgic maintenance irq\n"); + ret = -ENXIO; + goto out; + } + vgic->maint_irq = gic_kvm_info->maint_irq; + + if (!gic_kvm_info->vctrl.start) { + kvm_err("GICH not present in the firmware table\n"); ret = -ENXIO; goto out; } - ret = of_address_to_resource(vgic_node, 2, &vctrl_res); - if (ret) { - kvm_err("Cannot obtain GICH resource\n"); - goto out; - } - - vgic->vctrl_base = of_iomap(vgic_node, 2); + vgic->vctrl_base = ioremap(gic_kvm_info->vctrl.start, + resource_size(&gic_kvm_info->vctrl)); if (!vgic->vctrl_base) { kvm_err("Cannot ioremap GICH\n"); ret = -ENOMEM; @@ -228,29 +226,23 @@ int vgic_v2_probe(struct device_node *vgic_node, vgic->nr_lr = (vgic->nr_lr & 0x3f) + 1; ret = create_hyp_io_mappings(vgic->vctrl_base, - vgic->vctrl_base + resource_size(&vctrl_res), - vctrl_res.start); + vgic->vctrl_base + resource_size(vctrl_res), + vctrl_res->start); if (ret) { kvm_err("Cannot map VCTRL into hyp\n"); goto out_unmap; } - if (of_address_to_resource(vgic_node, 3, &vcpu_res)) { - kvm_err("Cannot obtain GICV resource\n"); - ret = -ENXIO; - goto out_unmap; - } - - if (!PAGE_ALIGNED(vcpu_res.start)) { + if (!PAGE_ALIGNED(vcpu_res->start)) { kvm_err("GICV physical address 0x%llx not page aligned\n", - (unsigned long long)vcpu_res.start); + (unsigned long long)vcpu_res->start); ret = -ENXIO; goto out_unmap; } - if (!PAGE_ALIGNED(resource_size(&vcpu_res))) { + if (!PAGE_ALIGNED(resource_size(vcpu_res))) { kvm_err("GICV size 0x%llx not a multiple of page size 0x%lx\n", - (unsigned long long)resource_size(&vcpu_res), + (unsigned long long)resource_size(vcpu_res), PAGE_SIZE); ret = -ENXIO; goto out_unmap; @@ -259,10 +251,10 @@ int vgic_v2_probe(struct device_node *vgic_node, vgic->can_emulate_gicv2 = true; kvm_register_device_ops(&kvm_arm_vgic_v2_ops, KVM_DEV_TYPE_ARM_VGIC_V2); - vgic->vcpu_base = vcpu_res.start; + vgic->vcpu_base = vcpu_res->start; - kvm_info("%s@%llx IRQ%d\n", vgic_node->name, - vctrl_res.start, vgic->maint_irq); + kvm_info("GICH base=0x%llx, GICV base=0x%llx, IRQ=%d\n", + gic_kvm_info->vctrl.start, vgic->vcpu_base, vgic->maint_irq); vgic->type = VGIC_V2; vgic->max_gic_vcpus = VGIC_V2_MAX_CPUS; @@ -276,6 +268,5 @@ int vgic_v2_probe(struct device_node *vgic_node, out_unmap: iounmap(vgic->vctrl_base); out: - of_node_put(vgic_node); return ret; } diff --git a/virt/kvm/arm/vgic-v3.c b/virt/kvm/arm/vgic-v3.c index 999bdc6d9d9f..c02a1b1cf855 100644 --- a/virt/kvm/arm/vgic-v3.c +++ b/virt/kvm/arm/vgic-v3.c @@ -20,11 +20,9 @@ #include #include #include -#include -#include -#include #include +#include #include #include @@ -222,30 +220,24 @@ static void vgic_cpu_init_lrs(void *params) } /** - * vgic_v3_probe - probe for a GICv3 compatible interrupt controller in DT - * @node: pointer to the DT node - * @ops: address of a pointer to the GICv3 operations - * @params: address of a pointer to HW-specific parameters + * vgic_v3_probe - probe for a GICv3 compatible interrupt controller + * @gic_kvm_info: pointer to the GIC description + * @ops: address of a pointer to the GICv3 operations + * @params: address of a pointer to HW-specific parameters * * Returns 0 if a GICv3 has been found, with the low level operations * in *ops and the HW parameters in *params. Returns an error code * otherwise. */ -int vgic_v3_probe(struct device_node *vgic_node, +int vgic_v3_probe(const struct gic_kvm_info *gic_kvm_info, const struct vgic_ops **ops, const struct vgic_params **params) { int ret = 0; - u32 gicv_idx; - struct resource vcpu_res; struct vgic_params *vgic = &vgic_v3_params; + const struct resource *vcpu_res = &gic_kvm_info->vcpu; - vgic->maint_irq = irq_of_parse_and_map(vgic_node, 0); - if (!vgic->maint_irq) { - kvm_err("error getting vgic maintenance irq from DT\n"); - ret = -ENXIO; - goto out; - } + vgic->maint_irq = gic_kvm_info->maint_irq; ich_vtr_el2 = kvm_call_hyp(__vgic_v3_get_ich_vtr_el2); @@ -256,24 +248,19 @@ int vgic_v3_probe(struct device_node *vgic_node, vgic->nr_lr = (ich_vtr_el2 & 0xf) + 1; vgic->can_emulate_gicv2 = false; - if (of_property_read_u32(vgic_node, "#redistributor-regions", &gicv_idx)) - gicv_idx = 1; - - gicv_idx += 3; /* Also skip GICD, GICC, GICH */ - if (of_address_to_resource(vgic_node, gicv_idx, &vcpu_res)) { + if (!vcpu_res->start) { kvm_info("GICv3: no GICV resource entry\n"); vgic->vcpu_base = 0; - } else if (!PAGE_ALIGNED(vcpu_res.start)) { + } else if (!PAGE_ALIGNED(vcpu_res->start)) { pr_warn("GICV physical address 0x%llx not page aligned\n", - (unsigned long long)vcpu_res.start); + (unsigned long long)vcpu_res->start); vgic->vcpu_base = 0; - } else if (!PAGE_ALIGNED(resource_size(&vcpu_res))) { + } else if (!PAGE_ALIGNED(resource_size(vcpu_res))) { pr_warn("GICV size 0x%llx not a multiple of page size 0x%lx\n", - (unsigned long long)resource_size(&vcpu_res), + (unsigned long long)resource_size(vcpu_res), PAGE_SIZE); - vgic->vcpu_base = 0; } else { - vgic->vcpu_base = vcpu_res.start; + vgic->vcpu_base = vcpu_res->start; vgic->can_emulate_gicv2 = true; kvm_register_device_ops(&kvm_arm_vgic_v2_ops, KVM_DEV_TYPE_ARM_VGIC_V2); @@ -286,15 +273,13 @@ int vgic_v3_probe(struct device_node *vgic_node, vgic->type = VGIC_V3; vgic->max_gic_vcpus = VGIC_V3_MAX_CPUS; - kvm_info("%s@%llx IRQ%d\n", vgic_node->name, - vcpu_res.start, vgic->maint_irq); + kvm_info("GICV base=0x%llx, IRQ=%d\n", + vgic->vcpu_base, vgic->maint_irq); on_each_cpu(vgic_cpu_init_lrs, vgic, 1); *ops = &vgic_v3_ops; *params = vgic; -out: - of_node_put(vgic_node); return ret; } diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c index 00429b392c61..60668a7f319a 100644 --- a/virt/kvm/arm/vgic.c +++ b/virt/kvm/arm/vgic.c @@ -21,9 +21,7 @@ #include #include #include -#include -#include -#include +#include #include #include @@ -33,6 +31,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include "trace.h" @@ -2389,33 +2388,38 @@ static struct notifier_block vgic_cpu_nb = { .notifier_call = vgic_cpu_notify, }; -static const struct of_device_id vgic_ids[] = { - { .compatible = "arm,cortex-a15-gic", .data = vgic_v2_probe, }, - { .compatible = "arm,cortex-a7-gic", .data = vgic_v2_probe, }, - { .compatible = "arm,gic-400", .data = vgic_v2_probe, }, - { .compatible = "arm,gic-v3", .data = vgic_v3_probe, }, - {}, -}; +static int kvm_vgic_probe(void) +{ + const struct gic_kvm_info *gic_kvm_info; + int ret; + + gic_kvm_info = gic_get_kvm_info(); + if (!gic_kvm_info) + return -ENODEV; + + switch (gic_kvm_info->type) { + case GIC_V2: + ret = vgic_v2_probe(gic_kvm_info, &vgic_ops, &vgic); + break; + case GIC_V3: + ret = vgic_v3_probe(gic_kvm_info, &vgic_ops, &vgic); + break; + default: + ret = -ENODEV; + } + + return ret; +} int kvm_vgic_hyp_init(void) { - const struct of_device_id *matched_id; - const int (*vgic_probe)(struct device_node *,const struct vgic_ops **, - const struct vgic_params **); - struct device_node *vgic_node; int ret; - vgic_node = of_find_matching_node_and_match(NULL, - vgic_ids, &matched_id); - if (!vgic_node) { - kvm_err("error: no compatible GIC node found\n"); - return -ENODEV; - } - - vgic_probe = matched_id->data; - ret = vgic_probe(vgic_node, &vgic_ops, &vgic); - if (ret) + ret = kvm_vgic_probe(); + if (ret) { + kvm_err("error: KVM vGIC probing failed\n"); return ret; + } ret = request_percpu_irq(vgic->maint_irq, vgic_maintenance_handler, "vgic", kvm_get_running_vcpus()); From a53d892dfb6f14f77c508e1027f5e1bdb400fd23 Mon Sep 17 00:00:00 2001 From: Julien Grall Date: Mon, 11 Apr 2016 16:33:00 +0100 Subject: [PATCH 28/29] clocksource: arm_arch_timer: Remove arch_timer_get_timecounter The only call of arch_timer_get_timecounter (in KVM) has been removed. Signed-off-by: Julien Grall Acked-by: Christoffer Dall Signed-off-by: Christoffer Dall --- drivers/clocksource/arm_arch_timer.c | 5 ----- include/clocksource/arm_arch_timer.h | 6 ------ 2 files changed, 11 deletions(-) diff --git a/drivers/clocksource/arm_arch_timer.c b/drivers/clocksource/arm_arch_timer.c index bb58224eea36..4814446a0024 100644 --- a/drivers/clocksource/arm_arch_timer.c +++ b/drivers/clocksource/arm_arch_timer.c @@ -475,11 +475,6 @@ struct arch_timer_kvm_info *arch_timer_get_kvm_info(void) return &arch_timer_kvm_info; } -struct timecounter *arch_timer_get_timecounter(void) -{ - return &arch_timer_kvm_info.timecounter; -} - static void __init arch_counter_register(unsigned type) { u64 start_count; diff --git a/include/clocksource/arm_arch_timer.h b/include/clocksource/arm_arch_timer.h index 9dd996a67e9e..caedb74c9210 100644 --- a/include/clocksource/arm_arch_timer.h +++ b/include/clocksource/arm_arch_timer.h @@ -58,7 +58,6 @@ struct arch_timer_kvm_info { extern u32 arch_timer_get_rate(void); extern u64 (*arch_timer_read_counter)(void); -extern struct timecounter *arch_timer_get_timecounter(void); extern struct arch_timer_kvm_info *arch_timer_get_kvm_info(void); #else @@ -73,11 +72,6 @@ static inline u64 arch_timer_read_counter(void) return 0; } -static inline struct timecounter *arch_timer_get_timecounter(void) -{ - return NULL; -} - #endif #endif From 06485053244480f5f403d8f89b8617bd7d549113 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Wed, 13 Apr 2016 17:57:37 +0100 Subject: [PATCH 29/29] kvm: arm64: Enable hardware updates of the Access Flag for Stage 2 page tables The ARMv8.1 architecture extensions introduce support for hardware updates of the access and dirty information in page table entries. With VTCR_EL2.HA enabled (bit 21), when the CPU accesses an IPA with the PTE_AF bit cleared in the stage 2 page table, instead of raising an Access Flag fault to EL2 the CPU sets the actual page table entry bit (10). To ensure that kernel modifications to the page table do not inadvertently revert a bit set by hardware updates, certain Stage 2 software pte/pmd operations must be performed atomically. The main user of the AF bit is the kvm_age_hva() mechanism. The kvm_age_hva_handler() function performs a "test and clear young" action on the pte/pmd. This needs to be atomic in respect of automatic hardware updates of the AF bit. Since the AF bit is in the same position for both Stage 1 and Stage 2, the patch reuses the existing ptep_test_and_clear_young() functionality if __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG is defined. Otherwise, the existing pte_young/pte_mkold mechanism is preserved. The kvm_set_s2pte_readonly() (and the corresponding pmd equivalent) have to perform atomic modifications in order to avoid a race with updates of the AF bit. The arm64 implementation has been re-written using exclusives. Currently, kvm_set_s2pte_writable() (and pmd equivalent) take a pointer argument and modify the pte/pmd in place. However, these functions are only used on local variables rather than actual page table entries, so it makes more sense to follow the pte_mkwrite() approach for stage 1 attributes. The change to kvm_s2pte_mkwrite() makes it clear that these functions do not modify the actual page table entries. The (pte|pmd)_mkyoung() uses on Stage 2 entries (setting the AF bit explicitly) do not need to be modified since hardware updates of the dirty status are not supported by KVM, so there is no possibility of losing such information. Signed-off-by: Catalin Marinas Cc: Paolo Bonzini Acked-by: Marc Zyngier Reviewed-by: Christoffer Dall Signed-off-by: Christoffer Dall --- arch/arm/include/asm/kvm_mmu.h | 10 ++++--- arch/arm/kvm/mmu.c | 46 ++++++++++++++++++++------------ arch/arm64/include/asm/kvm_arm.h | 2 ++ arch/arm64/include/asm/kvm_mmu.h | 27 ++++++++++++++----- arch/arm64/include/asm/pgtable.h | 13 ++++++--- arch/arm64/kvm/hyp/s2-setup.c | 8 ++++++ 6 files changed, 74 insertions(+), 32 deletions(-) diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index 6344ea0ad624..ef0b276d97fc 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -106,14 +106,16 @@ static inline void kvm_clean_pte(pte_t *pte) clean_pte_table(pte); } -static inline void kvm_set_s2pte_writable(pte_t *pte) +static inline pte_t kvm_s2pte_mkwrite(pte_t pte) { - pte_val(*pte) |= L_PTE_S2_RDWR; + pte_val(pte) |= L_PTE_S2_RDWR; + return pte; } -static inline void kvm_set_s2pmd_writable(pmd_t *pmd) +static inline pmd_t kvm_s2pmd_mkwrite(pmd_t pmd) { - pmd_val(*pmd) |= L_PMD_S2_RDWR; + pmd_val(pmd) |= L_PMD_S2_RDWR; + return pmd; } static inline void kvm_set_s2pte_readonly(pte_t *pte) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 74b5d199f6b7..783e5ff0b32e 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -977,6 +977,27 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, return 0; } +#ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG +static int stage2_ptep_test_and_clear_young(pte_t *pte) +{ + if (pte_young(*pte)) { + *pte = pte_mkold(*pte); + return 1; + } + return 0; +} +#else +static int stage2_ptep_test_and_clear_young(pte_t *pte) +{ + return __ptep_test_and_clear_young(pte); +} +#endif + +static int stage2_pmdp_test_and_clear_young(pmd_t *pmd) +{ + return stage2_ptep_test_and_clear_young((pte_t *)pmd); +} + /** * kvm_phys_addr_ioremap - map a device range to guest IPA * @@ -1000,7 +1021,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, pte_t pte = pfn_pte(pfn, PAGE_S2_DEVICE); if (writable) - kvm_set_s2pte_writable(&pte); + pte = kvm_s2pte_mkwrite(pte); ret = mmu_topup_memory_cache(&cache, KVM_MMU_CACHE_MIN_PAGES, KVM_NR_MEM_OBJS); @@ -1342,7 +1363,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, pmd_t new_pmd = pfn_pmd(pfn, mem_type); new_pmd = pmd_mkhuge(new_pmd); if (writable) { - kvm_set_s2pmd_writable(&new_pmd); + new_pmd = kvm_s2pmd_mkwrite(new_pmd); kvm_set_pfn_dirty(pfn); } coherent_cache_guest_page(vcpu, pfn, PMD_SIZE, fault_ipa_uncached); @@ -1351,7 +1372,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, pte_t new_pte = pfn_pte(pfn, mem_type); if (writable) { - kvm_set_s2pte_writable(&new_pte); + new_pte = kvm_s2pte_mkwrite(new_pte); kvm_set_pfn_dirty(pfn); mark_page_dirty(kvm, gfn); } @@ -1370,6 +1391,8 @@ out_unlock: * Resolve the access fault by making the page young again. * Note that because the faulting entry is guaranteed not to be * cached in the TLB, we don't need to invalidate anything. + * Only the HW Access Flag updates are supported for Stage 2 (no DBM), + * so there is no need for atomic (pte|pmd)_mkyoung operations. */ static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) { @@ -1610,25 +1633,14 @@ static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, void *data) if (!pmd || pmd_none(*pmd)) /* Nothing there */ return 0; - if (pmd_thp_or_huge(*pmd)) { /* THP, HugeTLB */ - if (pmd_young(*pmd)) { - *pmd = pmd_mkold(*pmd); - return 1; - } - - return 0; - } + if (pmd_thp_or_huge(*pmd)) /* THP, HugeTLB */ + return stage2_pmdp_test_and_clear_young(pmd); pte = pte_offset_kernel(pmd, gpa); if (pte_none(*pte)) return 0; - if (pte_young(*pte)) { - *pte = pte_mkold(*pte); /* Just a page... */ - return 1; - } - - return 0; + return stage2_ptep_test_and_clear_young(pte); } static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, void *data) diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index c6cbb361bbcf..ffde15fed3e1 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -111,6 +111,8 @@ /* VTCR_EL2 Registers bits */ #define VTCR_EL2_RES1 (1 << 31) +#define VTCR_EL2_HD (1 << 22) +#define VTCR_EL2_HA (1 << 21) #define VTCR_EL2_PS_MASK TCR_EL2_PS_MASK #define VTCR_EL2_TG0_MASK TCR_TG0_MASK #define VTCR_EL2_TG0_4K TCR_TG0_4K diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 249c4fc9c5f6..844fe5d5ff44 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -111,19 +111,32 @@ static inline void kvm_clean_pmd_entry(pmd_t *pmd) {} static inline void kvm_clean_pte(pte_t *pte) {} static inline void kvm_clean_pte_entry(pte_t *pte) {} -static inline void kvm_set_s2pte_writable(pte_t *pte) +static inline pte_t kvm_s2pte_mkwrite(pte_t pte) { - pte_val(*pte) |= PTE_S2_RDWR; + pte_val(pte) |= PTE_S2_RDWR; + return pte; } -static inline void kvm_set_s2pmd_writable(pmd_t *pmd) +static inline pmd_t kvm_s2pmd_mkwrite(pmd_t pmd) { - pmd_val(*pmd) |= PMD_S2_RDWR; + pmd_val(pmd) |= PMD_S2_RDWR; + return pmd; } static inline void kvm_set_s2pte_readonly(pte_t *pte) { - pte_val(*pte) = (pte_val(*pte) & ~PTE_S2_RDWR) | PTE_S2_RDONLY; + pteval_t pteval; + unsigned long tmp; + + asm volatile("// kvm_set_s2pte_readonly\n" + " prfm pstl1strm, %2\n" + "1: ldxr %0, %2\n" + " and %0, %0, %3 // clear PTE_S2_RDWR\n" + " orr %0, %0, %4 // set PTE_S2_RDONLY\n" + " stxr %w1, %0, %2\n" + " cbnz %w1, 1b\n" + : "=&r" (pteval), "=&r" (tmp), "+Q" (pte_val(*pte)) + : "L" (~PTE_S2_RDWR), "L" (PTE_S2_RDONLY)); } static inline bool kvm_s2pte_readonly(pte_t *pte) @@ -133,12 +146,12 @@ static inline bool kvm_s2pte_readonly(pte_t *pte) static inline void kvm_set_s2pmd_readonly(pmd_t *pmd) { - pmd_val(*pmd) = (pmd_val(*pmd) & ~PMD_S2_RDWR) | PMD_S2_RDONLY; + kvm_set_s2pte_readonly((pte_t *)pmd); } static inline bool kvm_s2pmd_readonly(pmd_t *pmd) { - return (pmd_val(*pmd) & PMD_S2_RDWR) == PMD_S2_RDONLY; + return kvm_s2pte_readonly((pte_t *)pmd); } static inline bool kvm_page_empty(void *ptr) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index dda4aa9ba3f8..f1d5afdb12db 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -532,14 +532,12 @@ static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) * Atomic pte/pmd modifications. */ #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG -static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, - unsigned long address, - pte_t *ptep) +static inline int __ptep_test_and_clear_young(pte_t *ptep) { pteval_t pteval; unsigned int tmp, res; - asm volatile("// ptep_test_and_clear_young\n" + asm volatile("// __ptep_test_and_clear_young\n" " prfm pstl1strm, %2\n" "1: ldxr %0, %2\n" " ubfx %w3, %w0, %5, #1 // extract PTE_AF (young)\n" @@ -552,6 +550,13 @@ static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, return res; } +static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, + pte_t *ptep) +{ + return __ptep_test_and_clear_young(ptep); +} + #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, diff --git a/arch/arm64/kvm/hyp/s2-setup.c b/arch/arm64/kvm/hyp/s2-setup.c index bcbe761a5a3d..b81f4091c909 100644 --- a/arch/arm64/kvm/hyp/s2-setup.c +++ b/arch/arm64/kvm/hyp/s2-setup.c @@ -65,6 +65,14 @@ u32 __hyp_text __init_stage2_translation(void) */ val |= 64 - (parange > 40 ? 40 : parange); + /* + * Check the availability of Hardware Access Flag / Dirty Bit + * Management in ID_AA64MMFR1_EL1 and enable the feature in VTCR_EL2. + */ + tmp = (read_sysreg(id_aa64mmfr1_el1) >> ID_AA64MMFR1_HADBS_SHIFT) & 0xf; + if (IS_ENABLED(CONFIG_ARM64_HW_AFDBM) && tmp) + val |= VTCR_EL2_HA; + /* * Read the VMIDBits bits from ID_AA64MMFR1_EL1 and set the VS * bit in VTCR_EL2.