Merge branch 'akpm' (patches from Andrew)
Merge small final update from Andrew Morton:

 - DAX feature work: add fsync/msync support

 - kfree cleanup, MAINTAINERS update

* emailed patches from Andrew Morton <akpm@linux-foundation.org>:
  MAINTAINERS: return arch/sh to maintained state, with new maintainers
  tree wide: use kvfree() than conditional kfree()/vfree()
  dax: never rely on bh.b_dev being set by get_block()
  xfs: call dax_pfn_mkwrite() for DAX fsync/msync
  ext4: call dax_pfn_mkwrite() for DAX fsync/msync
  ext2: call dax_pfn_mkwrite() for DAX fsync/msync
  dax: add support for fsync/sync
  mm: add find_get_entries_tag()
  dax: support dirty DAX entries in radix tree
  pmem: add wb_cache_pmem() to the PMEM API
  dax: fix conversion of holes to PMDs
  dax: fix NULL pointer dereference in __dax_dbg()
This commit is contained in:
Commit 20c759ca98
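Most of the non-DAX hunks below are the same mechanical cleanup: call sites that chose between kfree() and vfree() based on the allocation size (or a saved flag) are collapsed into a single kvfree() call. A minimal sketch of the before/after pattern -- illustrative only, not taken verbatim from any one file in this merge:

#include <linux/mm.h>		/* kvfree() */
#include <linux/slab.h>		/* kfree() */
#include <linux/vmalloc.h>	/* vfree() */

/* Before: the caller has to remember how the buffer was allocated. */
static void old_style_free(void *buf, size_t size)
{
	if (size <= PAGE_SIZE)
		kfree(buf);	/* buffer came from kmalloc() */
	else
		vfree(buf);	/* buffer came from vmalloc() */
}

/* After: kvfree() detects the allocation type from the address itself. */
static void new_style_free(void *buf)
{
	kvfree(buf);
}

The DRBD, mspec and ipc hunks additionally drop the bookkeeping (flags and saved sizes) that only existed to pick the right free function.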

@@ -10453,9 +10453,11 @@ S: Maintained
F: drivers/net/ethernet/dlink/sundance.c

SUPERH
M: Yoshinori Sato <ysato@users.sourceforge.jp>
M: Rich Felker <dalias@libc.org>
L: linux-sh@vger.kernel.org
Q: http://patchwork.kernel.org/project/linux-sh/list/
S: Orphan
S: Maintained
F: Documentation/sh/
F: arch/sh/
F: drivers/sh/

@@ -1200,10 +1200,7 @@ error:
    while (i--)
        if (pages[i])
            __free_pages(pages[i], 0);
    if (array_size <= PAGE_SIZE)
        kfree(pages);
    else
        vfree(pages);
    kvfree(pages);
    return NULL;
}

@@ -1211,7 +1208,6 @@ static int __iommu_free_buffer(struct device *dev, struct page **pages,
        size_t size, struct dma_attrs *attrs)
{
    int count = size >> PAGE_SHIFT;
    int array_size = count * sizeof(struct page *);
    int i;

    if (dma_get_attr(DMA_ATTR_FORCE_CONTIGUOUS, attrs)) {

@@ -1222,10 +1218,7 @@ static int __iommu_free_buffer(struct device *dev, struct page **pages,
            __free_pages(pages[i], 0);
    }

    if (array_size <= PAGE_SIZE)
        kfree(pages);
    else
        vfree(pages);
    kvfree(pages);
    return 0;
}

@@ -67,18 +67,19 @@ static inline void arch_wmb_pmem(void)
}

/**
 * __arch_wb_cache_pmem - write back a cache range with CLWB
 * arch_wb_cache_pmem - write back a cache range with CLWB
 * @vaddr: virtual start address
 * @size: number of bytes to write back
 *
 * Write back a cache range using the CLWB (cache line write back)
 * instruction. This function requires explicit ordering with an
 * arch_wmb_pmem() call. This API is internal to the x86 PMEM implementation.
 * arch_wmb_pmem() call.
 */
static inline void __arch_wb_cache_pmem(void *vaddr, size_t size)
static inline void arch_wb_cache_pmem(void __pmem *addr, size_t size)
{
    u16 x86_clflush_size = boot_cpu_data.x86_clflush_size;
    unsigned long clflush_mask = x86_clflush_size - 1;
    void *vaddr = (void __force *)addr;
    void *vend = vaddr + size;
    void *p;

@@ -115,7 +116,7 @@ static inline size_t arch_copy_from_iter_pmem(void __pmem *addr, size_t bytes,
    len = copy_from_iter_nocache(vaddr, bytes, i);

    if (__iter_needs_pmem_wb(i))
        __arch_wb_cache_pmem(vaddr, bytes);
        arch_wb_cache_pmem(addr, bytes);

    return len;
}

@@ -133,7 +134,7 @@ static inline void arch_clear_pmem(void __pmem *addr, size_t size)
    void *vaddr = (void __force *)addr;

    memset(vaddr, 0, size);
    __arch_wb_cache_pmem(vaddr, size);
    arch_wb_cache_pmem(addr, size);
}

static inline bool __arch_has_wmb_pmem(void)

@@ -32,6 +32,7 @@
#include <linux/hardirq.h>
#include <linux/pstore.h>
#include <linux/vmalloc.h>
#include <linux/mm.h> /* kvfree() */
#include <acpi/apei.h>

#include "apei-internal.h"

@@ -532,10 +533,7 @@ retry:
        return -ENOMEM;
    memcpy(new_entries, entries,
           erst_record_id_cache.len * sizeof(entries[0]));
    if (erst_record_id_cache.size < PAGE_SIZE)
        kfree(entries);
    else
        vfree(entries);
    kvfree(entries);
    erst_record_id_cache.entries = entries = new_entries;
    erst_record_id_cache.size = new_size;
}

@@ -364,12 +364,9 @@ static void bm_free_pages(struct page **pages, unsigned long number)
    }
}

static void bm_vk_free(void *ptr, int v)
static inline void bm_vk_free(void *ptr)
{
    if (v)
        vfree(ptr);
    else
        kfree(ptr);
    kvfree(ptr);
}

/*

@@ -379,7 +376,7 @@ static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want)
{
    struct page **old_pages = b->bm_pages;
    struct page **new_pages, *page;
    unsigned int i, bytes, vmalloced = 0;
    unsigned int i, bytes;
    unsigned long have = b->bm_number_of_pages;

    BUG_ON(have == 0 && old_pages != NULL);

@@ -401,7 +398,6 @@ static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want)
                PAGE_KERNEL);
        if (!new_pages)
            return NULL;
        vmalloced = 1;
    }

    if (want >= have) {

@@ -411,7 +407,7 @@ static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want)
            page = alloc_page(GFP_NOIO | __GFP_HIGHMEM);
            if (!page) {
                bm_free_pages(new_pages + have, i - have);
                bm_vk_free(new_pages, vmalloced);
                bm_vk_free(new_pages);
                return NULL;
            }
            /* we want to know which page it is

@@ -427,11 +423,6 @@ static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want)
         */
    }

    if (vmalloced)
        b->bm_flags |= BM_P_VMALLOCED;
    else
        b->bm_flags &= ~BM_P_VMALLOCED;

    return new_pages;
}

@@ -469,7 +460,7 @@ void drbd_bm_cleanup(struct drbd_device *device)
    if (!expect(device->bitmap))
        return;
    bm_free_pages(device->bitmap->bm_pages, device->bitmap->bm_number_of_pages);
    bm_vk_free(device->bitmap->bm_pages, (BM_P_VMALLOCED & device->bitmap->bm_flags));
    bm_vk_free(device->bitmap->bm_pages);
    kfree(device->bitmap);
    device->bitmap = NULL;
}

@@ -643,7 +634,6 @@ int drbd_bm_resize(struct drbd_device *device, sector_t capacity, int set_new_bi
    unsigned long want, have, onpages; /* number of pages */
    struct page **npages, **opages = NULL;
    int err = 0, growing;
    int opages_vmalloced;

    if (!expect(b))
        return -ENOMEM;

@@ -656,8 +646,6 @@ int drbd_bm_resize(struct drbd_device *device, sector_t capacity, int set_new_bi
    if (capacity == b->bm_dev_capacity)
        goto out;

    opages_vmalloced = (BM_P_VMALLOCED & b->bm_flags);

    if (capacity == 0) {
        spin_lock_irq(&b->bm_lock);
        opages = b->bm_pages;

@@ -671,7 +659,7 @@ int drbd_bm_resize(struct drbd_device *device, sector_t capacity, int set_new_bi
        b->bm_dev_capacity = 0;
        spin_unlock_irq(&b->bm_lock);
        bm_free_pages(opages, onpages);
        bm_vk_free(opages, opages_vmalloced);
        bm_vk_free(opages);
        goto out;
    }
    bits = BM_SECT_TO_BIT(ALIGN(capacity, BM_SECT_PER_BIT));

@@ -744,7 +732,7 @@ int drbd_bm_resize(struct drbd_device *device, sector_t capacity, int set_new_bi

    spin_unlock_irq(&b->bm_lock);
    if (opages != npages)
        bm_vk_free(opages, opages_vmalloced);
        bm_vk_free(opages);
    if (!growing)
        b->bm_set = bm_count_bits(b);
    drbd_info(device, "resync bitmap: bits=%lu words=%lu pages=%lu\n", bits, words, want);

@@ -536,9 +536,6 @@ struct drbd_bitmap; /* opaque for drbd_device */
/* definition of bits in bm_flags to be used in drbd_bm_lock
 * and drbd_bitmap_io and friends. */
enum bm_flag {
    /* do we need to kfree, or vfree bm_pages? */
    BM_P_VMALLOCED = 0x10000, /* internal use only, will be masked out */

    /* currently locked for bulk operation */
    BM_LOCKED_MASK = 0xf,


@@ -93,14 +93,11 @@ struct vma_data {
    spinlock_t lock; /* Serialize access to this structure. */
    int count; /* Number of pages allocated. */
    enum mspec_page_type type; /* Type of pages allocated. */
    int flags; /* See VMD_xxx below. */
    unsigned long vm_start; /* Original (unsplit) base. */
    unsigned long vm_end; /* Original (unsplit) end. */
    unsigned long maddr[0]; /* Array of MSPEC addresses. */
};

#define VMD_VMALLOCED 0x1 /* vmalloc'd rather than kmalloc'd */

/* used on shub2 to clear FOP cache in the HUB */
static unsigned long scratch_page[MAX_NUMNODES];
#define SH2_AMO_CACHE_ENTRIES 4

@@ -185,10 +182,7 @@ mspec_close(struct vm_area_struct *vma)
                "failed to zero page %ld\n", my_page);
    }

    if (vdata->flags & VMD_VMALLOCED)
        vfree(vdata);
    else
        kfree(vdata);
    kvfree(vdata);
}

/*

@@ -256,7 +250,7 @@ mspec_mmap(struct file *file, struct vm_area_struct *vma,
                    enum mspec_page_type type)
{
    struct vma_data *vdata;
    int pages, vdata_size, flags = 0;
    int pages, vdata_size;

    if (vma->vm_pgoff != 0)
        return -EINVAL;

@@ -271,16 +265,13 @@ mspec_mmap(struct file *file, struct vm_area_struct *vma,
    vdata_size = sizeof(struct vma_data) + pages * sizeof(long);
    if (vdata_size <= PAGE_SIZE)
        vdata = kzalloc(vdata_size, GFP_KERNEL);
    else {
    else
        vdata = vzalloc(vdata_size);
        flags = VMD_VMALLOCED;
    }
    if (!vdata)
        return -ENOMEM;

    vdata->vm_start = vma->vm_start;
    vdata->vm_end = vma->vm_end;
    vdata->flags = flags;
    vdata->type = type;
    spin_lock_init(&vdata->lock);
    atomic_set(&vdata->refcnt, 1);

@@ -198,10 +198,7 @@ EXPORT_SYMBOL(drm_ht_remove_item);
void drm_ht_remove(struct drm_open_hash *ht)
{
    if (ht->table) {
        if ((PAGE_SIZE / sizeof(*ht->table)) >> ht->order)
            kfree(ht->table);
        else
            vfree(ht->table);
        kvfree(ht->table);
        ht->table = NULL;
    }
}

@@ -151,16 +151,12 @@ do { \

#define LIBCFS_FREE(ptr, size) \
do { \
    int s = (size); \
    if (unlikely((ptr) == NULL)) { \
        CERROR("LIBCFS: free NULL '" #ptr "' (%d bytes) at " \
               "%s:%d\n", s, __FILE__, __LINE__); \
               "%s:%d\n", (int)(size), __FILE__, __LINE__); \
        break; \
    } \
    if (unlikely(s > LIBCFS_VMALLOC_SIZE)) \
        vfree(ptr); \
    else \
        kfree(ptr); \
    kvfree(ptr); \
} while (0)

/******************************************************************************/

@@ -75,7 +75,7 @@ void kill_bdev(struct block_device *bdev)
{
    struct address_space *mapping = bdev->bd_inode->i_mapping;

    if (mapping->nrpages == 0 && mapping->nrshadows == 0)
    if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
        return;

    invalidate_bh_lrus();

@@ -72,8 +72,7 @@ void coda_sysctl_clean(void);
} while (0)


#define CODA_FREE(ptr,size) \
    do { if (size < PAGE_SIZE) kfree((ptr)); else vfree((ptr)); } while (0)
#define CODA_FREE(ptr, size) kvfree((ptr))

/* inode to cnode access functions */


fs/dax.c | 274

@@ -24,6 +24,7 @@
#include <linux/memcontrol.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/pagevec.h>
#include <linux/pmem.h>
#include <linux/sched.h>
#include <linux/uio.h>

@@ -245,6 +246,7 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
    loff_t end = pos + iov_iter_count(iter);

    memset(&bh, 0, sizeof(bh));
    bh.b_bdev = inode->i_sb->s_bdev;

    if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) {
        struct address_space *mapping = inode->i_mapping;

@@ -324,6 +326,199 @@ static int copy_user_bh(struct page *to, struct inode *inode,
    return 0;
}

#define NO_SECTOR -1
#define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_CACHE_SHIFT))

static int dax_radix_entry(struct address_space *mapping, pgoff_t index,
        sector_t sector, bool pmd_entry, bool dirty)
{
    struct radix_tree_root *page_tree = &mapping->page_tree;
    pgoff_t pmd_index = DAX_PMD_INDEX(index);
    int type, error = 0;
    void *entry;

    WARN_ON_ONCE(pmd_entry && !dirty);
    __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);

    spin_lock_irq(&mapping->tree_lock);

    entry = radix_tree_lookup(page_tree, pmd_index);
    if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD) {
        index = pmd_index;
        goto dirty;
    }

    entry = radix_tree_lookup(page_tree, index);
    if (entry) {
        type = RADIX_DAX_TYPE(entry);
        if (WARN_ON_ONCE(type != RADIX_DAX_PTE &&
                type != RADIX_DAX_PMD)) {
            error = -EIO;
            goto unlock;
        }

        if (!pmd_entry || type == RADIX_DAX_PMD)
            goto dirty;

        /*
         * We only insert dirty PMD entries into the radix tree. This
         * means we don't need to worry about removing a dirty PTE
         * entry and inserting a clean PMD entry, thus reducing the
         * range we would flush with a follow-up fsync/msync call.
         */
        radix_tree_delete(&mapping->page_tree, index);
        mapping->nrexceptional--;
    }

    if (sector == NO_SECTOR) {
        /*
         * This can happen during correct operation if our pfn_mkwrite
         * fault raced against a hole punch operation. If this
         * happens the pte that was hole punched will have been
         * unmapped and the radix tree entry will have been removed by
         * the time we are called, but the call will still happen. We
         * will return all the way up to wp_pfn_shared(), where the
         * pte_same() check will fail, eventually causing page fault
         * to be retried by the CPU.
         */
        goto unlock;
    }

    error = radix_tree_insert(page_tree, index,
            RADIX_DAX_ENTRY(sector, pmd_entry));
    if (error)
        goto unlock;

    mapping->nrexceptional++;
dirty:
    if (dirty)
        radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
unlock:
    spin_unlock_irq(&mapping->tree_lock);
    return error;
}

static int dax_writeback_one(struct block_device *bdev,
        struct address_space *mapping, pgoff_t index, void *entry)
{
    struct radix_tree_root *page_tree = &mapping->page_tree;
    int type = RADIX_DAX_TYPE(entry);
    struct radix_tree_node *node;
    struct blk_dax_ctl dax;
    void **slot;
    int ret = 0;

    spin_lock_irq(&mapping->tree_lock);
    /*
     * Regular page slots are stabilized by the page lock even
     * without the tree itself locked. These unlocked entries
     * need verification under the tree lock.
     */
    if (!__radix_tree_lookup(page_tree, index, &node, &slot))
        goto unlock;
    if (*slot != entry)
        goto unlock;

    /* another fsync thread may have already written back this entry */
    if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
        goto unlock;

    if (WARN_ON_ONCE(type != RADIX_DAX_PTE && type != RADIX_DAX_PMD)) {
        ret = -EIO;
        goto unlock;
    }

    dax.sector = RADIX_DAX_SECTOR(entry);
    dax.size = (type == RADIX_DAX_PMD ? PMD_SIZE : PAGE_SIZE);
    spin_unlock_irq(&mapping->tree_lock);

    /*
     * We cannot hold tree_lock while calling dax_map_atomic() because it
     * eventually calls cond_resched().
     */
    ret = dax_map_atomic(bdev, &dax);
    if (ret < 0)
        return ret;

    if (WARN_ON_ONCE(ret < dax.size)) {
        ret = -EIO;
        goto unmap;
    }

    wb_cache_pmem(dax.addr, dax.size);

    spin_lock_irq(&mapping->tree_lock);
    radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE);
    spin_unlock_irq(&mapping->tree_lock);
unmap:
    dax_unmap_atomic(bdev, &dax);
    return ret;

unlock:
    spin_unlock_irq(&mapping->tree_lock);
    return ret;
}

/*
 * Flush the mapping to the persistent domain within the byte range of [start,
 * end]. This is required by data integrity operations to ensure file data is
 * on persistent storage prior to completion of the operation.
 */
int dax_writeback_mapping_range(struct address_space *mapping, loff_t start,
        loff_t end)
{
    struct inode *inode = mapping->host;
    struct block_device *bdev = inode->i_sb->s_bdev;
    pgoff_t start_index, end_index, pmd_index;
    pgoff_t indices[PAGEVEC_SIZE];
    struct pagevec pvec;
    bool done = false;
    int i, ret = 0;
    void *entry;

    if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
        return -EIO;

    start_index = start >> PAGE_CACHE_SHIFT;
    end_index = end >> PAGE_CACHE_SHIFT;
    pmd_index = DAX_PMD_INDEX(start_index);

    rcu_read_lock();
    entry = radix_tree_lookup(&mapping->page_tree, pmd_index);
    rcu_read_unlock();

    /* see if the start of our range is covered by a PMD entry */
    if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD)
        start_index = pmd_index;

    tag_pages_for_writeback(mapping, start_index, end_index);

    pagevec_init(&pvec, 0);
    while (!done) {
        pvec.nr = find_get_entries_tag(mapping, start_index,
                PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE,
                pvec.pages, indices);

        if (pvec.nr == 0)
            break;

        for (i = 0; i < pvec.nr; i++) {
            if (indices[i] > end_index) {
                done = true;
                break;
            }

            ret = dax_writeback_one(bdev, mapping, indices[i],
                    pvec.pages[i]);
            if (ret < 0)
                return ret;
        }
    }
    wmb_pmem();
    return 0;
}
EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);

static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
        struct vm_area_struct *vma, struct vm_fault *vmf)
{

@@ -363,6 +558,11 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
    }
    dax_unmap_atomic(bdev, &dax);

    error = dax_radix_entry(mapping, vmf->pgoff, dax.sector, false,
            vmf->flags & FAULT_FLAG_WRITE);
    if (error)
        goto out;

    error = vm_insert_mixed(vma, vaddr, dax.pfn);

 out:

@@ -408,6 +608,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,

    memset(&bh, 0, sizeof(bh));
    block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits);
    bh.b_bdev = inode->i_sb->s_bdev;
    bh.b_size = PAGE_SIZE;

 repeat:

@@ -487,6 +688,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
        delete_from_page_cache(page);
        unlock_page(page);
        page_cache_release(page);
        page = NULL;
    }

    /*

@@ -590,7 +792,8 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
    struct block_device *bdev;
    pgoff_t size, pgoff;
    sector_t block;
    int result = 0;
    int error, result = 0;
    bool alloc = false;

    /* dax pmd mappings require pfn_t_devmap() */
    if (!IS_ENABLED(CONFIG_FS_DAX_PMD))

@@ -624,13 +827,21 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
    }

    memset(&bh, 0, sizeof(bh));
    bh.b_bdev = inode->i_sb->s_bdev;
    block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);

    bh.b_size = PMD_SIZE;
    if (get_block(inode, block, &bh, write) != 0)

    if (get_block(inode, block, &bh, 0) != 0)
        return VM_FAULT_SIGBUS;

    if (!buffer_mapped(&bh) && write) {
        if (get_block(inode, block, &bh, 1) != 0)
            return VM_FAULT_SIGBUS;
        alloc = true;
    }

    bdev = bh.b_bdev;
    i_mmap_lock_read(mapping);

    /*
     * If the filesystem isn't willing to tell us the length of a hole,

@@ -639,19 +850,22 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
     */
    if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) {
        dax_pmd_dbg(&bh, address, "allocated block too small");
        goto fallback;
        return VM_FAULT_FALLBACK;
    }

    /*
     * If we allocated new storage, make sure no process has any
     * zero pages covering this hole
     */
    if (buffer_new(&bh)) {
        i_mmap_unlock_read(mapping);
        unmap_mapping_range(mapping, pgoff << PAGE_SHIFT, PMD_SIZE, 0);
        i_mmap_lock_read(mapping);
    if (alloc) {
        loff_t lstart = pgoff << PAGE_SHIFT;
        loff_t lend = lstart + PMD_SIZE - 1; /* inclusive */

        truncate_pagecache_range(inode, lstart, lend);
    }

    i_mmap_lock_read(mapping);

    /*
     * If a truncate happened while we were allocating blocks, we may
     * leave blocks allocated to the file that are beyond EOF. We can't

@@ -664,7 +878,8 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
        goto out;
    }
    if ((pgoff | PG_PMD_COLOUR) >= size) {
        dax_pmd_dbg(&bh, address, "pgoff unaligned");
        dax_pmd_dbg(&bh, address,
                "offset + huge page size > file size");
        goto fallback;
    }

@@ -732,6 +947,31 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
        }
        dax_unmap_atomic(bdev, &dax);

        /*
         * For PTE faults we insert a radix tree entry for reads, and
         * leave it clean. Then on the first write we dirty the radix
         * tree entry via the dax_pfn_mkwrite() path. This sequence
         * allows the dax_pfn_mkwrite() call to be simpler and avoid a
         * call into get_block() to translate the pgoff to a sector in
         * order to be able to create a new radix tree entry.
         *
         * The PMD path doesn't have an equivalent to
         * dax_pfn_mkwrite(), though, so for a read followed by a
         * write we traverse all the way through __dax_pmd_fault()
         * twice. This means we can just skip inserting a radix tree
         * entry completely on the initial read and just wait until
         * the write to insert a dirty entry.
         */
        if (write) {
            error = dax_radix_entry(mapping, pgoff, dax.sector,
                    true, true);
            if (error) {
                dax_pmd_dbg(&bh, address,
                        "PMD radix insertion failed");
                goto fallback;
            }
        }

        dev_dbg(part_to_dev(bdev->bd_part),
                "%s: %s addr: %lx pfn: %lx sect: %llx\n",
                __func__, current->comm, address,

@@ -790,15 +1030,20 @@ EXPORT_SYMBOL_GPL(dax_pmd_fault);
 * dax_pfn_mkwrite - handle first write to DAX page
 * @vma: The virtual memory area where the fault occurred
 * @vmf: The description of the fault
 *
 */
int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
    struct super_block *sb = file_inode(vma->vm_file)->i_sb;
    struct file *file = vma->vm_file;

    sb_start_pagefault(sb);
    file_update_time(vma->vm_file);
    sb_end_pagefault(sb);
    /*
     * We pass NO_SECTOR to dax_radix_entry() because we expect that a
     * RADIX_DAX_PTE entry already exists in the radix tree from a
     * previous call to __dax_fault(). We just want to look up that PTE
     * entry using vmf->pgoff and make sure the dirty tag is set. This
     * saves us from having to make a call to get_block() here to look
     * up the sector.
     */
    dax_radix_entry(file->f_mapping, vmf->pgoff, NO_SECTOR, false, true);
    return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);

@@ -835,6 +1080,7 @@ int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
    BUG_ON((offset + length) > PAGE_CACHE_SIZE);

    memset(&bh, 0, sizeof(bh));
    bh.b_bdev = inode->i_sb->s_bdev;
    bh.b_size = PAGE_CACHE_SIZE;
    err = get_block(inode, index, &bh, 0);
    if (err < 0)

@@ -102,8 +102,8 @@ static int ext2_dax_pfn_mkwrite(struct vm_area_struct *vma,
{
    struct inode *inode = file_inode(vma->vm_file);
    struct ext2_inode_info *ei = EXT2_I(inode);
    int ret = VM_FAULT_NOPAGE;
    loff_t size;
    int ret;

    sb_start_pagefault(inode->i_sb);
    file_update_time(vma->vm_file);

@@ -113,6 +113,8 @@ static int ext2_dax_pfn_mkwrite(struct vm_area_struct *vma,
    size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
    if (vmf->pgoff >= size)
        ret = VM_FAULT_SIGBUS;
    else
        ret = dax_pfn_mkwrite(vma, vmf);

    up_read(&ei->dax_sem);
    sb_end_pagefault(inode->i_sb);

@@ -291,8 +291,8 @@ static int ext4_dax_pfn_mkwrite(struct vm_area_struct *vma,
{
    struct inode *inode = file_inode(vma->vm_file);
    struct super_block *sb = inode->i_sb;
    int ret = VM_FAULT_NOPAGE;
    loff_t size;
    int ret;

    sb_start_pagefault(sb);
    file_update_time(vma->vm_file);

@@ -300,6 +300,8 @@ static int ext4_dax_pfn_mkwrite(struct vm_area_struct *vma,
    size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
    if (vmf->pgoff >= size)
        ret = VM_FAULT_SIGBUS;
    else
        ret = dax_pfn_mkwrite(vma, vmf);
    up_read(&EXT4_I(inode)->i_mmap_sem);
    sb_end_pagefault(sb);


@@ -495,7 +495,7 @@ void clear_inode(struct inode *inode)
     */
    spin_lock_irq(&inode->i_data.tree_lock);
    BUG_ON(inode->i_data.nrpages);
    BUG_ON(inode->i_data.nrshadows);
    BUG_ON(inode->i_data.nrexceptional);
    spin_unlock_irq(&inode->i_data.tree_lock);
    BUG_ON(!list_empty(&inode->i_data.private_list));
    BUG_ON(!(inode->i_state & I_FREEING));

@@ -17,6 +17,7 @@
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/mtd/mtd.h>
#include <linux/mm.h> /* kvfree() */
#include "nodelist.h"

static void jffs2_build_remove_unlinked_inode(struct jffs2_sb_info *,

@@ -383,12 +384,7 @@ int jffs2_do_mount_fs(struct jffs2_sb_info *c)
    return 0;

 out_free:
#ifndef __ECOS
    if (jffs2_blocks_use_vmalloc(c))
        vfree(c->blocks);
    else
#endif
        kfree(c->blocks);
    kvfree(c->blocks);

    return ret;
}
|
@ -596,10 +596,7 @@ int jffs2_do_fill_super(struct super_block *sb, void *data, int silent)
|
|||
out_root:
|
||||
jffs2_free_ino_caches(c);
|
||||
jffs2_free_raw_node_refs(c);
|
||||
if (jffs2_blocks_use_vmalloc(c))
|
||||
vfree(c->blocks);
|
||||
else
|
||||
kfree(c->blocks);
|
||||
kvfree(c->blocks);
|
||||
out_inohash:
|
||||
jffs2_clear_xattr_subsystem(c);
|
||||
kfree(c->inocache_list);
|
||||
|
|
|
@ -331,10 +331,7 @@ static void jffs2_put_super (struct super_block *sb)
|
|||
|
||||
jffs2_free_ino_caches(c);
|
||||
jffs2_free_raw_node_refs(c);
|
||||
if (jffs2_blocks_use_vmalloc(c))
|
||||
vfree(c->blocks);
|
||||
else
|
||||
kfree(c->blocks);
|
||||
kvfree(c->blocks);
|
||||
jffs2_flash_cleanup(c);
|
||||
kfree(c->inocache_list);
|
||||
jffs2_clear_xattr_subsystem(c);
|
||||
|
|
|

@@ -279,17 +279,12 @@ static void udf_sb_free_bitmap(struct udf_bitmap *bitmap)
{
    int i;
    int nr_groups = bitmap->s_nr_groups;
    int size = sizeof(struct udf_bitmap) + (sizeof(struct buffer_head *) *
            nr_groups);

    for (i = 0; i < nr_groups; i++)
        if (bitmap->s_block_bitmap[i])
            brelse(bitmap->s_block_bitmap[i]);

    if (size <= PAGE_SIZE)
        kfree(bitmap);
    else
        vfree(bitmap);
    kvfree(bitmap);
}

static void udf_free_partition(struct udf_part_map *map)

@@ -1610,9 +1610,8 @@ xfs_filemap_pmd_fault(
/*
 * pfn_mkwrite was originally inteneded to ensure we capture time stamp
 * updates on write faults. In reality, it's need to serialise against
 * truncate similar to page_mkwrite. Hence we open-code dax_pfn_mkwrite()
 * here and cycle the XFS_MMAPLOCK_SHARED to ensure we serialise the fault
 * barrier in place.
 * truncate similar to page_mkwrite. Hence we cycle the XFS_MMAPLOCK_SHARED
 * to ensure we serialise the fault barrier in place.
 */
static int
xfs_filemap_pfn_mkwrite(

@@ -1635,6 +1634,8 @@ xfs_filemap_pfn_mkwrite(
    size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
    if (vmf->pgoff >= size)
        ret = VM_FAULT_SIGBUS;
    else if (IS_DAX(inode))
        ret = dax_pfn_mkwrite(vma, vmf);
    xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
    sb_end_pagefault(inode->i_sb);
    return ret;

@@ -36,4 +36,11 @@ static inline bool vma_is_dax(struct vm_area_struct *vma)
{
    return vma->vm_file && IS_DAX(vma->vm_file->f_mapping->host);
}

static inline bool dax_mapping(struct address_space *mapping)
{
    return mapping->host && IS_DAX(mapping->host);
}
int dax_writeback_mapping_range(struct address_space *mapping, loff_t start,
        loff_t end);
#endif
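dax_mapping() and dax_writeback_mapping_range() added above are the hooks the generic sync path uses; the mm/filemap.c hunk later in this diff wires them into filemap_write_and_wait_range(). An illustrative sketch of that call pattern, with a hypothetical helper name, not part of this commit's diff:

#include <linux/dax.h>
#include <linux/fs.h>

/* Illustrative only: how a sync path consults the new DAX hooks. */
static int example_flush_dax(struct address_space *mapping,
                             loff_t start, loff_t end)
{
	/* Only DAX mappings track dirty exceptional entries to write back. */
	if (dax_mapping(mapping) && mapping->nrexceptional)
		return dax_writeback_mapping_range(mapping, start, end);
	return 0;
}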

@@ -433,7 +433,8 @@ struct address_space {
    struct rw_semaphore i_mmap_rwsem; /* protect tree, count, list */
    /* Protected by tree_lock together with the radix tree */
    unsigned long nrpages; /* number of total pages */
    unsigned long nrshadows; /* number of shadow entries */
    /* number of shadow or DAX exceptional entries */
    unsigned long nrexceptional;
    pgoff_t writeback_index;/* writeback starts here */
    const struct address_space_operations *a_ops; /* methods */
    unsigned long flags; /* error bits/gfp mask */

@@ -361,6 +361,9 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start,
        unsigned int nr_pages, struct page **pages);
unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
        int tag, unsigned int nr_pages, struct page **pages);
unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start,
        int tag, unsigned int nr_entries,
        struct page **entries, pgoff_t *indices);

struct page *grab_cache_page_write_begin(struct address_space *mapping,
        pgoff_t index, unsigned flags);

@@ -53,12 +53,18 @@ static inline void arch_clear_pmem(void __pmem *addr, size_t size)
{
    BUG();
}

static inline void arch_wb_cache_pmem(void __pmem *addr, size_t size)
{
    BUG();
}
#endif

/*
 * Architectures that define ARCH_HAS_PMEM_API must provide
 * implementations for arch_memcpy_to_pmem(), arch_wmb_pmem(),
 * arch_copy_from_iter_pmem(), arch_clear_pmem() and arch_has_wmb_pmem().
 * arch_copy_from_iter_pmem(), arch_clear_pmem(), arch_wb_cache_pmem()
 * and arch_has_wmb_pmem().
 */
static inline void memcpy_from_pmem(void *dst, void __pmem const *src, size_t size)
{

@@ -178,4 +184,18 @@ static inline void clear_pmem(void __pmem *addr, size_t size)
    else
        default_clear_pmem(addr, size);
}

/**
 * wb_cache_pmem - write back processor cache for PMEM memory range
 * @addr: virtual start address
 * @size: number of bytes to write back
 *
 * Write back the processor cache range starting at 'addr' for 'size' bytes.
 * This function requires explicit ordering with a wmb_pmem() call.
 */
static inline void wb_cache_pmem(void __pmem *addr, size_t size)
{
    if (arch_has_pmem_api())
        arch_wb_cache_pmem(addr, size);
}
#endif /* __PMEM_H__ */
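The wb_cache_pmem() kernel-doc above requires pairing with wmb_pmem() for ordering, and fs/dax.c in this same merge does exactly that: wb_cache_pmem() per entry in dax_writeback_one(), then a single wmb_pmem() at the end of dax_writeback_mapping_range(). A minimal sketch of the pairing, with a hypothetical helper name, not part of this commit's diff:

#include <linux/pmem.h>

/* Illustrative only: flush one pmem range, then enforce ordering. */
static void example_flush_pmem_range(void __pmem *addr, size_t size)
{
	/* Write back any dirty CPU cache lines covering [addr, addr + size). */
	wb_cache_pmem(addr, size);

	/* Order/commit the preceding write-backs to the persistent domain. */
	wmb_pmem();
}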

@@ -51,6 +51,15 @@
#define RADIX_TREE_EXCEPTIONAL_ENTRY 2
#define RADIX_TREE_EXCEPTIONAL_SHIFT 2

#define RADIX_DAX_MASK 0xf
#define RADIX_DAX_SHIFT 4
#define RADIX_DAX_PTE (0x4 | RADIX_TREE_EXCEPTIONAL_ENTRY)
#define RADIX_DAX_PMD (0x8 | RADIX_TREE_EXCEPTIONAL_ENTRY)
#define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_MASK)
#define RADIX_DAX_SECTOR(entry) (((unsigned long)entry >> RADIX_DAX_SHIFT))
#define RADIX_DAX_ENTRY(sector, pmd) ((void *)((unsigned long)sector << \
        RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE)))

static inline int radix_tree_is_indirect_ptr(void *ptr)
{
    return (int)((unsigned long)ptr & RADIX_TREE_INDIRECT_PTR);
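The RADIX_DAX_* macros above pack a block sector and an entry type into a single exceptional radix-tree slot. A small sketch of the round trip they imply -- illustrative only, with made-up names, and assuming the sector value fits in the bits left after the type field:

#include <linux/radix-tree.h>
#include <linux/types.h>

/* Illustrative: encode a PTE-sized DAX entry for 'sector', then decode it. */
static void example_radix_dax_roundtrip(sector_t sector)
{
	void *entry = RADIX_DAX_ENTRY(sector, false);	/* false => RADIX_DAX_PTE */

	WARN_ON(RADIX_DAX_TYPE(entry) != RADIX_DAX_PTE);
	WARN_ON(RADIX_DAX_SECTOR(entry) != sector);
}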

@@ -1493,7 +1493,7 @@ out_rcu_wakeup:
    wake_up_sem_queue_do(&tasks);
out_free:
    if (sem_io != fast_sem_io)
        ipc_free(sem_io, sizeof(ushort)*nsems);
        ipc_free(sem_io);
    return err;
}


ipc/util.c | 11

@@ -414,17 +414,12 @@ void *ipc_alloc(int size)
/**
 * ipc_free - free ipc space
 * @ptr: pointer returned by ipc_alloc
 * @size: size of block
 *
 * Free a block created with ipc_alloc(). The caller must know the size
 * used in the allocation call.
 * Free a block created with ipc_alloc().
 */
void ipc_free(void *ptr, int size)
void ipc_free(void *ptr)
{
    if (size > PAGE_SIZE)
        vfree(ptr);
    else
        kfree(ptr);
    kvfree(ptr);
}

/**

@@ -118,7 +118,7 @@ int ipcperms(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp, short flg);
 * both function can sleep
 */
void *ipc_alloc(int size);
void ipc_free(void *ptr, int size);
void ipc_free(void *ptr);

/*
 * For allocation that need to be freed by RCU.

mm/filemap.c | 91

@@ -11,6 +11,7 @@
 */
#include <linux/export.h>
#include <linux/compiler.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/uaccess.h>
#include <linux/capability.h>

@@ -123,9 +124,9 @@ static void page_cache_tree_delete(struct address_space *mapping,
    __radix_tree_lookup(&mapping->page_tree, page->index, &node, &slot);

    if (shadow) {
        mapping->nrshadows++;
        mapping->nrexceptional++;
        /*
         * Make sure the nrshadows update is committed before
         * Make sure the nrexceptional update is committed before
         * the nrpages update so that final truncate racing
         * with reclaim does not see both counters 0 at the
         * same time and miss a shadow entry.

@@ -481,6 +482,12 @@ int filemap_write_and_wait_range(struct address_space *mapping,
{
    int err = 0;

    if (dax_mapping(mapping) && mapping->nrexceptional) {
        err = dax_writeback_mapping_range(mapping, lstart, lend);
        if (err)
            return err;
    }

    if (mapping->nrpages) {
        err = __filemap_fdatawrite_range(mapping, lstart, lend,
                WB_SYNC_ALL);

@@ -579,9 +586,13 @@ static int page_cache_tree_insert(struct address_space *mapping,
        p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
        if (!radix_tree_exceptional_entry(p))
            return -EEXIST;

        if (WARN_ON(dax_mapping(mapping)))
            return -EINVAL;

        if (shadowp)
            *shadowp = p;
        mapping->nrshadows--;
        mapping->nrexceptional--;
        if (node)
            workingset_node_shadows_dec(node);
    }

@@ -1245,9 +1256,9 @@ repeat:
            if (radix_tree_deref_retry(page))
                goto restart;
            /*
             * A shadow entry of a recently evicted page,
             * or a swap entry from shmem/tmpfs. Return
             * it without attempting to raise page count.
             * A shadow entry of a recently evicted page, a swap
             * entry from shmem/tmpfs or a DAX entry. Return it
             * without attempting to raise page count.
             */
            goto export;
        }

@@ -1494,6 +1505,74 @@ repeat:
}
EXPORT_SYMBOL(find_get_pages_tag);

/**
 * find_get_entries_tag - find and return entries that match @tag
 * @mapping: the address_space to search
 * @start: the starting page cache index
 * @tag: the tag index
 * @nr_entries: the maximum number of entries
 * @entries: where the resulting entries are placed
 * @indices: the cache indices corresponding to the entries in @entries
 *
 * Like find_get_entries, except we only return entries which are tagged with
 * @tag.
 */
unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start,
        int tag, unsigned int nr_entries,
        struct page **entries, pgoff_t *indices)
{
    void **slot;
    unsigned int ret = 0;
    struct radix_tree_iter iter;

    if (!nr_entries)
        return 0;

    rcu_read_lock();
restart:
    radix_tree_for_each_tagged(slot, &mapping->page_tree,
            &iter, start, tag) {
        struct page *page;
repeat:
        page = radix_tree_deref_slot(slot);
        if (unlikely(!page))
            continue;
        if (radix_tree_exception(page)) {
            if (radix_tree_deref_retry(page)) {
                /*
                 * Transient condition which can only trigger
                 * when entry at index 0 moves out of or back
                 * to root: none yet gotten, safe to restart.
                 */
                goto restart;
            }

            /*
             * A shadow entry of a recently evicted page, a swap
             * entry from shmem/tmpfs or a DAX entry. Return it
             * without attempting to raise page count.
             */
            goto export;
        }
        if (!page_cache_get_speculative(page))
            goto repeat;

        /* Has the page moved? */
        if (unlikely(page != *slot)) {
            page_cache_release(page);
            goto repeat;
        }
export:
        indices[ret] = iter.index;
        entries[ret] = page;
        if (++ret == nr_entries)
            break;
    }
    rcu_read_unlock();
    return ret;
}
EXPORT_SYMBOL(find_get_entries_tag);

/*
 * CD/DVDs are error prone. When a medium error occurs, the driver may fail
 * a _large_ part of the i/o request. Imagine the worst scenario:

mm/percpu.c | 18

@@ -305,16 +305,12 @@ static void *pcpu_mem_zalloc(size_t size)
/**
 * pcpu_mem_free - free memory
 * @ptr: memory to free
 * @size: size of the area
 *
 * Free @ptr. @ptr should have been allocated using pcpu_mem_zalloc().
 */
static void pcpu_mem_free(void *ptr, size_t size)
static void pcpu_mem_free(void *ptr)
{
    if (size <= PAGE_SIZE)
        kfree(ptr);
    else
        vfree(ptr);
    kvfree(ptr);
}

/**

@@ -463,8 +459,8 @@ out_unlock:
     * pcpu_mem_free() might end up calling vfree() which uses
     * IRQ-unsafe lock and thus can't be called under pcpu_lock.
     */
    pcpu_mem_free(old, old_size);
    pcpu_mem_free(new, new_size);
    pcpu_mem_free(old);
    pcpu_mem_free(new);

    return 0;
}

@@ -732,7 +728,7 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void)
    chunk->map = pcpu_mem_zalloc(PCPU_DFL_MAP_ALLOC *
            sizeof(chunk->map[0]));
    if (!chunk->map) {
        pcpu_mem_free(chunk, pcpu_chunk_struct_size);
        pcpu_mem_free(chunk);
        return NULL;
    }

@@ -753,8 +749,8 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk)
{
    if (!chunk)
        return;
    pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]));
    pcpu_mem_free(chunk, pcpu_chunk_struct_size);
    pcpu_mem_free(chunk->map);
    pcpu_mem_free(chunk);
}

/**

@@ -9,6 +9,7 @@

#include <linux/kernel.h>
#include <linux/backing-dev.h>
#include <linux/dax.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>

@@ -34,31 +35,39 @@ static void clear_exceptional_entry(struct address_space *mapping,
        return;

    spin_lock_irq(&mapping->tree_lock);
    /*
     * Regular page slots are stabilized by the page lock even
     * without the tree itself locked. These unlocked entries
     * need verification under the tree lock.
     */
    if (!__radix_tree_lookup(&mapping->page_tree, index, &node, &slot))
        goto unlock;
    if (*slot != entry)
        goto unlock;
    radix_tree_replace_slot(slot, NULL);
    mapping->nrshadows--;
    if (!node)
        goto unlock;
    workingset_node_shadows_dec(node);
    /*
     * Don't track node without shadow entries.
     *
     * Avoid acquiring the list_lru lock if already untracked.
     * The list_empty() test is safe as node->private_list is
     * protected by mapping->tree_lock.
     */
    if (!workingset_node_shadows(node) &&
        !list_empty(&node->private_list))
        list_lru_del(&workingset_shadow_nodes, &node->private_list);
    __radix_tree_delete_node(&mapping->page_tree, node);

    if (dax_mapping(mapping)) {
        if (radix_tree_delete_item(&mapping->page_tree, index, entry))
            mapping->nrexceptional--;
    } else {
        /*
         * Regular page slots are stabilized by the page lock even
         * without the tree itself locked. These unlocked entries
         * need verification under the tree lock.
         */
        if (!__radix_tree_lookup(&mapping->page_tree, index, &node,
                &slot))
            goto unlock;
        if (*slot != entry)
            goto unlock;
        radix_tree_replace_slot(slot, NULL);
        mapping->nrexceptional--;
        if (!node)
            goto unlock;
        workingset_node_shadows_dec(node);
        /*
         * Don't track node without shadow entries.
         *
         * Avoid acquiring the list_lru lock if already untracked.
         * The list_empty() test is safe as node->private_list is
         * protected by mapping->tree_lock.
         */
        if (!workingset_node_shadows(node) &&
            !list_empty(&node->private_list))
            list_lru_del(&workingset_shadow_nodes,
                    &node->private_list);
        __radix_tree_delete_node(&mapping->page_tree, node);
    }
unlock:
    spin_unlock_irq(&mapping->tree_lock);
}

@@ -228,7 +237,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
    int i;

    cleancache_invalidate_inode(mapping);
    if (mapping->nrpages == 0 && mapping->nrshadows == 0)
    if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
        return;

    /* Offsets within partial pages */

@@ -402,7 +411,7 @@ EXPORT_SYMBOL(truncate_inode_pages);
 */
void truncate_inode_pages_final(struct address_space *mapping)
{
    unsigned long nrshadows;
    unsigned long nrexceptional;
    unsigned long nrpages;

    /*

@@ -416,14 +425,14 @@ void truncate_inode_pages_final(struct address_space *mapping)

    /*
     * When reclaim installs eviction entries, it increases
     * nrshadows first, then decreases nrpages. Make sure we see
     * nrexceptional first, then decreases nrpages. Make sure we see
     * this in the right order or we might miss an entry.
     */
    nrpages = mapping->nrpages;
    smp_rmb();
    nrshadows = mapping->nrshadows;
    nrexceptional = mapping->nrexceptional;

    if (nrpages || nrshadows) {
    if (nrpages || nrexceptional) {
        /*
         * As truncation uses a lockless tree lookup, cycle
         * the tree lock to make sure any ongoing tree

@@ -46,6 +46,7 @@
#include <linux/oom.h>
#include <linux/prefetch.h>
#include <linux/printk.h>
#include <linux/dax.h>

#include <asm/tlbflush.h>
#include <asm/div64.h>

@@ -671,9 +672,15 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
         * inode reclaim needs to empty out the radix tree or
         * the nodes are lost. Don't plant shadows behind its
         * back.
         *
         * We also don't store shadows for DAX mappings because the
         * only page cache pages found in these are zero pages
         * covering holes, and because we don't want to mix DAX
         * exceptional entries and shadow exceptional entries in the
         * same page_tree.
         */
        if (reclaimed && page_is_file_cache(page) &&
            !mapping_exiting(mapping))
            !mapping_exiting(mapping) && !dax_mapping(mapping))
            shadow = workingset_eviction(mapping, page);
        __delete_from_page_cache(page, shadow, memcg);
        spin_unlock_irqrestore(&mapping->tree_lock, flags);

@@ -351,8 +351,8 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
            node->slots[i] = NULL;
            BUG_ON(node->count < (1U << RADIX_TREE_COUNT_SHIFT));
            node->count -= 1U << RADIX_TREE_COUNT_SHIFT;
            BUG_ON(!mapping->nrshadows);
            mapping->nrshadows--;
            BUG_ON(!mapping->nrexceptional);
            mapping->nrexceptional--;
        }
    }
    BUG_ON(node->count);

@@ -289,10 +289,8 @@ static void __node_free_rcu(struct rcu_head *head)

    if (!n->tn_bits)
        kmem_cache_free(trie_leaf_kmem, n);
    else if (n->tn_bits <= TNODE_KMALLOC_MAX)
        kfree(n);
    else
        vfree(n);
    kvfree(n);
}

#define node_free(n) call_rcu(&tn_info(n)->rcu, __node_free_rcu)