/*
 * linux/mm/fremap.c
 *
 * Explicit pagetable population and nonlinear (random) mappings support.
 *
 * started by Ingo Molnar, Copyright (C) 2002, 2003
 */

#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/file.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/swapops.h>
#include <linux/rmap.h>
#include <linux/module.h>
#include <linux/syscalls.h>

#include <asm/mmu_context.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>

/*
 * Clear the pte at @addr, releasing whatever it pointed to.  Returns
 * nonzero if a present page was unmapped, so the caller can adjust its
 * rss accounting.
 */
static int zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long addr, pte_t *ptep)
{
	pte_t pte = *ptep;
	struct page *page = NULL;

	if (pte_present(pte)) {
		unsigned long pfn = pte_pfn(pte);
		flush_cache_page(vma, addr, pfn);
		pte = ptep_clear_flush(vma, addr, ptep);
		if (unlikely(!pfn_valid(pfn))) {
			print_bad_pte(vma, pte, addr);
			goto out;
		}
		page = pfn_to_page(pfn);
		if (pte_dirty(pte))
			set_page_dirty(page);
		page_remove_rmap(page);
		page_cache_release(page);
	} else {
		if (!pte_file(pte))
			free_swap_and_cache(pte_to_swp_entry(pte));
		pte_clear(mm, addr, ptep);
	}
out:
	return !!page;
}

/*
 * Install a file page to a given virtual memory address, release any
 * previously existing mapping.
 */
int install_page(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long addr, struct page *page, pgprot_t prot)
{
	struct inode *inode;
	pgoff_t size;
	int err = -ENOMEM;
	pte_t *pte;
	pmd_t *pmd;
	pud_t *pud;
	pgd_t *pgd;
	pte_t pte_val;
	spinlock_t *ptl;

	BUG_ON(vma->vm_flags & VM_UNPAGED);

	pgd = pgd_offset(mm, addr);
	pud = pud_alloc(mm, pgd, addr);
	if (!pud)
		goto out;
	pmd = pmd_alloc(mm, pud, addr);
	if (!pmd)
		goto out;
	pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
	if (!pte)
		goto out;

	/*
	 * This page may have been truncated. Tell the
	 * caller about it.
	 */
	err = -EINVAL;
	inode = vma->vm_file->f_mapping->host;
	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	if (!page->mapping || page->index >= size)
		goto unlock;
	err = -ENOMEM;
	if (page_mapcount(page) > INT_MAX/2)
		goto unlock;

	if (pte_none(*pte) || !zap_pte(mm, vma, addr, pte))
		inc_mm_counter(mm, file_rss);

	flush_icache_page(vma, page);
	set_pte_at(mm, addr, pte, mk_pte(page, prot));
	page_add_file_rmap(page);
	pte_val = *pte;
	update_mmu_cache(vma, addr, pte_val);
	err = 0;
unlock:
	pte_unmap_unlock(pte, ptl);
out:
	return err;
}
EXPORT_SYMBOL(install_page);

/*
 * Install a file pte to a given virtual memory address, release any
 * previously existing mapping.
 */
int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long addr, unsigned long pgoff, pgprot_t prot)
{
	int err = -ENOMEM;
	pte_t *pte;
	pmd_t *pmd;
	pud_t *pud;
	pgd_t *pgd;
	pte_t pte_val;
	spinlock_t *ptl;

	BUG_ON(vma->vm_flags & VM_UNPAGED);

	pgd = pgd_offset(mm, addr);
	pud = pud_alloc(mm, pgd, addr);
	if (!pud)
		goto out;
	pmd = pmd_alloc(mm, pud, addr);
	if (!pmd)
		goto out;
	pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
	if (!pte)
		goto out;

	if (!pte_none(*pte) && zap_pte(mm, vma, addr, pte)) {
		update_hiwater_rss(mm);
		dec_mm_counter(mm, file_rss);
	}

	set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff));
	pte_val = *pte;
	update_mmu_cache(vma, addr, pte_val);
	pte_unmap_unlock(pte, ptl);
	err = 0;
out:
	return err;
}
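
/*
 * For context: sys_remap_file_pages() below reaches the two helpers above
 * through the vma's ->populate method, which walks the requested range one
 * page at a time.  A minimal sketch of such a method follows -- it is
 * illustrative only, not the real filemap_populate() from mm/filemap.c,
 * and my_getpage() is a hypothetical stand-in for the filesystem's page
 * lookup:
 *
 *	static int my_populate(struct vm_area_struct *vma, unsigned long addr,
 *			unsigned long len, pgprot_t prot, unsigned long pgoff,
 *			int nonblock)
 *	{
 *		struct mm_struct *mm = vma->vm_mm;
 *		struct page *page;
 *		int err;
 *
 *		do {
 *			page = my_getpage(vma, pgoff, nonblock);
 *			if (page) {
 *				err = install_page(mm, vma, addr, page, prot);
 *				if (err) {
 *					page_cache_release(page);
 *					return err;
 *				}
 *			} else if (nonblock) {
 *				err = install_file_pte(mm, vma, addr, pgoff, prot);
 *				if (err)
 *					return err;
 *			}
 *			len -= PAGE_SIZE;
 *			addr += PAGE_SIZE;
 *			pgoff++;
 *		} while (len);
 *		return 0;
 *	}
 *
 * The nonblock branch is what install_file_pte() exists for: instead of
 * doing IO up front, it records the file offset in a pte_file pte so the
 * fault handler can bring the right page in later.
 */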

/***
 * sys_remap_file_pages - remap arbitrary pages of a shared backing store
 *                        file within an existing vma.
 * @start: start of the remapped virtual memory range
 * @size: size of the remapped virtual memory range
 * @prot: new protection bits of the range
 * @pgoff: to be mapped page of the backing store file
 * @flags: 0 or MAP_NONBLOCK - the latter causes no IO.
 *
 * This syscall works purely via pagetables, so it's the most efficient
 * way to map the same (large) file into a given virtual window.  Unlike
 * mmap()/mremap() it does not create any new vmas.  The new mappings are
 * also safe across swapout.
 *
 * NOTE: the 'prot' parameter right now is ignored, and the vma's default
 * protection is used.  Arbitrary protections might be implemented in the
 * future.
 */
asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
	unsigned long __prot, unsigned long pgoff, unsigned long flags)
{
	struct mm_struct *mm = current->mm;
	struct address_space *mapping;
	unsigned long end = start + size;
	struct vm_area_struct *vma;
	int err = -EINVAL;
	int has_write_lock = 0;

	if (__prot)
		return err;
	/*
	 * Sanitize the syscall parameters:
	 */
	start = start & PAGE_MASK;
	size = size & PAGE_MASK;

	/* Does the address range wrap, or is the span zero-sized? */
	if (start + size <= start)
		return err;

	/* Can we represent this offset inside this architecture's pte's? */
#if PTE_FILE_MAX_BITS < BITS_PER_LONG
	if (pgoff + (size >> PAGE_SHIFT) >= (1UL << PTE_FILE_MAX_BITS))
		return err;
#endif

	/* We need down_write() to change vma->vm_flags. */
	down_read(&mm->mmap_sem);
 retry:
	vma = find_vma(mm, start);

	/*
	 * Make sure the vma is shared, that it supports prefaulting,
	 * and that the remapped range is valid and fully within
	 * the single existing vma.  vm_private_data is used as a
	 * swapout cursor in a VM_NONLINEAR vma.
	 */
	if (vma && (vma->vm_flags & VM_SHARED) &&
		(!vma->vm_private_data || (vma->vm_flags & VM_NONLINEAR)) &&
		vma->vm_ops && vma->vm_ops->populate &&
			end > start && start >= vma->vm_start &&
				end <= vma->vm_end) {

		/* Must set VM_NONLINEAR before any pages are populated. */
		if (pgoff != linear_page_index(vma, start) &&
			!(vma->vm_flags & VM_NONLINEAR)) {
			if (!has_write_lock) {
				up_read(&mm->mmap_sem);
				down_write(&mm->mmap_sem);
				has_write_lock = 1;
				goto retry;
			}
			mapping = vma->vm_file->f_mapping;
			spin_lock(&mapping->i_mmap_lock);
			flush_dcache_mmap_lock(mapping);
			vma->vm_flags |= VM_NONLINEAR;
			vma_prio_tree_remove(vma, &mapping->i_mmap);
			vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
			flush_dcache_mmap_unlock(mapping);
			spin_unlock(&mapping->i_mmap_lock);
		}

		err = vma->vm_ops->populate(vma, start, size,
					    vma->vm_page_prot,
					    pgoff, flags & MAP_NONBLOCK);

		/*
		 * We can't clear VM_NONLINEAR because we'd have to do
		 * it after ->populate completes, and that would prevent
		 * downgrading the lock.  (Locks can't be upgraded).
		 */
	}
	if (likely(!has_write_lock))
		up_read(&mm->mmap_sem);
	else
		up_write(&mm->mmap_sem);

	return err;
}
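
/*
 * Userspace view -- a minimal sketch of driving this syscall (illustrative
 * only; error handling elided, and page_size is assumed to come from
 * sysconf(_SC_PAGESIZE)):
 *
 *	int fd = open("datafile", O_RDWR);
 *	char *win = mmap(NULL, 4 * page_size, PROT_READ | PROT_WRITE,
 *			 MAP_SHARED, fd, 0);
 *
 *	remap_file_pages(win, page_size, 0, 2, 0);
 *
 * The call rewires the first page of the window to file page 2 (byte
 * offset 2 * page_size), purely by editing ptes: no new vma is created,
 * and the vma becomes VM_NONLINEAR since it no longer maps the file
 * linearly.
 */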