diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index 43876f16caf1..cb00ccc7d571 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -47,16 +47,26 @@ static inline void native_set_pte(pte_t *ptep, pte_t pte) * they can run pmd_offset_map_lock or pmd_trans_huge or other pmd * operations. * - * Without THP if the mmap_sem is hold for reading, the - * pmd can only transition from null to not null while pmd_read_atomic runs. - * So there's no need of literally reading it atomically. + * Without THP if the mmap_sem is hold for reading, the pmd can only + * transition from null to not null while pmd_read_atomic runs. So + * we can always return atomic pmd values with this function. * * With THP if the mmap_sem is hold for reading, the pmd can become - * THP or null or point to a pte (and in turn become "stable") at any - * time under pmd_read_atomic, so it's mandatory to read it atomically - * with cmpxchg8b. + * trans_huge or none or point to a pte (and in turn become "stable") + * at any time under pmd_read_atomic. We could read it really + * atomically here with a atomic64_read for the THP enabled case (and + * it would be a whole lot simpler), but to avoid using cmpxchg8b we + * only return an atomic pmdval if the low part of the pmdval is later + * found stable (i.e. pointing to a pte). And we're returning a none + * pmdval if the low part of the pmd is none. In some cases the high + * and low part of the pmdval returned may not be consistent if THP is + * enabled (the low part may point to previously mapped hugepage, + * while the high part may point to a more recently mapped hugepage), + * but pmd_none_or_trans_huge_or_clear_bad() only needs the low part + * of the pmd to be read atomically to decide if the pmd is unstable + * or not, with the only exception of when the low part of the pmd is + * zero in which case we return a none pmd. */ -#ifndef CONFIG_TRANSPARENT_HUGEPAGE static inline pmd_t pmd_read_atomic(pmd_t *pmdp) { pmdval_t ret; @@ -74,12 +84,6 @@ static inline pmd_t pmd_read_atomic(pmd_t *pmdp) return (pmd_t) { ret }; } -#else /* CONFIG_TRANSPARENT_HUGEPAGE */ -static inline pmd_t pmd_read_atomic(pmd_t *pmdp) -{ - return (pmd_t) { atomic64_read((atomic64_t *)pmdp) }; -} -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte) { diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 6f2b45a9b6bc..ff4947b7a976 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -484,6 +484,16 @@ static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t *pmd) /* * The barrier will stabilize the pmdval in a register or on * the stack so that it will stop changing under the code. + * + * When CONFIG_TRANSPARENT_HUGEPAGE=y on x86 32bit PAE, + * pmd_read_atomic is allowed to return a not atomic pmdval + * (for example pointing to an hugepage that has never been + * mapped in the pmd). The below checks will only care about + * the low part of the pmd with 32bit PAE x86 anyway, with the + * exception of pmd_none(). So the important thing is that if + * the low part of the pmd is found null, the high part will + * be also null or the pmd_none() check below would be + * confused. */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE barrier();