mm: allow page fault handlers to perform the COW

Currently, a COW fault on an XIP file is handled by first bringing in a
read-only mapping, then retrying the fault and copying the page.  It is much
more efficient to tell the fault handler that a COW is being attempted (by
passing the pre-allocated page in the new cow_page field of the vm_fault
structure), and allow the handler to perform the COW operation itself.

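The handler cannot insert the page itself if there is already a read-only
mapping at that address, so allow the handler to return VM_FAULT_LOCKED
while leaving the returned fault page (vmf->page) set to NULL.  This
indicates to the MM code that the handler holds i_mmap_lock (for read)
instead of the page lock, and that the MM code must release i_mmap_lock,
rather than unlock and release a page, when it has finished with the fault.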

Signed-off-by: Matthew Wilcox <matthew.r.wilcox@intel.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Andreas Dilger <andreas.dilger@intel.com>
Cc: Boaz Harrosh <boaz@plexistor.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
This commit is contained in:
Matthew Wilcox 2015-02-16 15:58:50 -08:00 committed by Linus Torvalds
Parent 283307c760
Commit 2e4cdab058
2 changed files with 33 additions and 9 deletions

View file

@ -224,6 +224,7 @@ struct vm_fault {
pgoff_t pgoff; /* Logical page offset based on vma */ pgoff_t pgoff; /* Logical page offset based on vma */
void __user *virtual_address; /* Faulting virtual address */ void __user *virtual_address; /* Faulting virtual address */
struct page *cow_page; /* Handler may choose to COW */
struct page *page; /* ->fault handlers should return a struct page *page; /* ->fault handlers should return a
* page here, unless VM_FAULT_NOPAGE * page here, unless VM_FAULT_NOPAGE
* is set (which is also implied by * is set (which is also implied by

View file

@ -1965,6 +1965,7 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page,
vmf.pgoff = page->index; vmf.pgoff = page->index;
vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
vmf.page = page; vmf.page = page;
vmf.cow_page = NULL;
ret = vma->vm_ops->page_mkwrite(vma, &vmf); ret = vma->vm_ops->page_mkwrite(vma, &vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
@ -2639,7 +2640,8 @@ oom:
* See filemap_fault() and __lock_page_retry(). * See filemap_fault() and __lock_page_retry().
*/ */
static int __do_fault(struct vm_area_struct *vma, unsigned long address, static int __do_fault(struct vm_area_struct *vma, unsigned long address,
pgoff_t pgoff, unsigned int flags, struct page **page) pgoff_t pgoff, unsigned int flags,
struct page *cow_page, struct page **page)
{ {
struct vm_fault vmf; struct vm_fault vmf;
int ret; int ret;
@ -2648,10 +2650,13 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address,
vmf.pgoff = pgoff; vmf.pgoff = pgoff;
vmf.flags = flags; vmf.flags = flags;
vmf.page = NULL; vmf.page = NULL;
vmf.cow_page = cow_page;
ret = vma->vm_ops->fault(vma, &vmf); ret = vma->vm_ops->fault(vma, &vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
return ret; return ret;
if (!vmf.page)
goto out;
if (unlikely(PageHWPoison(vmf.page))) { if (unlikely(PageHWPoison(vmf.page))) {
if (ret & VM_FAULT_LOCKED) if (ret & VM_FAULT_LOCKED)
@ -2665,6 +2670,7 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address,
else else
VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page); VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page);
out:
*page = vmf.page; *page = vmf.page;
return ret; return ret;
} }
@ -2835,7 +2841,7 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
pte_unmap_unlock(pte, ptl); pte_unmap_unlock(pte, ptl);
} }
ret = __do_fault(vma, address, pgoff, flags, &fault_page); ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
return ret; return ret;
@ -2875,26 +2881,43 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
return VM_FAULT_OOM; return VM_FAULT_OOM;
} }
ret = __do_fault(vma, address, pgoff, flags, &fault_page); ret = __do_fault(vma, address, pgoff, flags, new_page, &fault_page);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
goto uncharge_out; goto uncharge_out;
copy_user_highpage(new_page, fault_page, address, vma); if (fault_page)
copy_user_highpage(new_page, fault_page, address, vma);
__SetPageUptodate(new_page); __SetPageUptodate(new_page);
pte = pte_offset_map_lock(mm, pmd, address, &ptl); pte = pte_offset_map_lock(mm, pmd, address, &ptl);
if (unlikely(!pte_same(*pte, orig_pte))) { if (unlikely(!pte_same(*pte, orig_pte))) {
pte_unmap_unlock(pte, ptl); pte_unmap_unlock(pte, ptl);
unlock_page(fault_page); if (fault_page) {
page_cache_release(fault_page); unlock_page(fault_page);
page_cache_release(fault_page);
} else {
/*
* The fault handler has no page to lock, so it holds
* i_mmap_lock for read to protect against truncate.
*/
i_mmap_unlock_read(vma->vm_file->f_mapping);
}
goto uncharge_out; goto uncharge_out;
} }
do_set_pte(vma, address, new_page, pte, true, true); do_set_pte(vma, address, new_page, pte, true, true);
mem_cgroup_commit_charge(new_page, memcg, false); mem_cgroup_commit_charge(new_page, memcg, false);
lru_cache_add_active_or_unevictable(new_page, vma); lru_cache_add_active_or_unevictable(new_page, vma);
pte_unmap_unlock(pte, ptl); pte_unmap_unlock(pte, ptl);
unlock_page(fault_page); if (fault_page) {
page_cache_release(fault_page); unlock_page(fault_page);
page_cache_release(fault_page);
} else {
/*
* The fault handler has no page to lock, so it holds
* i_mmap_lock for read to protect against truncate.
*/
i_mmap_unlock_read(vma->vm_file->f_mapping);
}
return ret; return ret;
uncharge_out: uncharge_out:
mem_cgroup_cancel_charge(new_page, memcg); mem_cgroup_cancel_charge(new_page, memcg);
@ -2913,7 +2936,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
int dirtied = 0; int dirtied = 0;
int ret, tmp; int ret, tmp;
ret = __do_fault(vma, address, pgoff, flags, &fault_page); ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
return ret; return ret;