diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 862fc07dc6c0..8f1e390fd71b 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -219,6 +219,9 @@ static inline int fault_in_pages_writeable(char __user *uaddr, int size) { int ret; + if (unlikely(size == 0)) + return 0; + /* * Writing zeroes into userspace here is OK, because we know that if * the zero gets there, we'll be overwriting it. @@ -238,19 +241,23 @@ static inline int fault_in_pages_writeable(char __user *uaddr, int size) return ret; } -static inline void fault_in_pages_readable(const char __user *uaddr, int size) +static inline int fault_in_pages_readable(const char __user *uaddr, int size) { volatile char c; int ret; + if (unlikely(size == 0)) + return 0; + ret = __get_user(c, uaddr); if (ret == 0) { const char __user *end = uaddr + size - 1; if (((unsigned long)uaddr & PAGE_MASK) != ((unsigned long)end & PAGE_MASK)) - __get_user(c, end); + ret = __get_user(c, end); } + return ret; } #endif /* _LINUX_PAGEMAP_H */ diff --git a/mm/filemap.c b/mm/filemap.c index c59d5b3cd99a..557fd887254f 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1826,11 +1826,12 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, filemap_set_next_iovec(&cur_iov, nr_segs, &iov_offset, written); do { + struct page *src_page; struct page *page; pgoff_t index; /* Pagecache index for current page */ unsigned long offset; /* Offset into pagecache page */ - unsigned long maxlen; /* Bytes remaining in current iovec */ - size_t bytes; /* Bytes to write to page */ + unsigned long seglen; /* Bytes remaining in current iovec */ + unsigned long bytes; /* Bytes to write to page */ size_t copied; /* Bytes copied from user */ buf = cur_iov->iov_base + iov_offset; @@ -1840,20 +1841,30 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, if (bytes > count) bytes = count; - maxlen = cur_iov->iov_len - iov_offset; - if (maxlen > bytes) - maxlen = bytes; + /* + * a non-NULL src_page indicates that we're doing the + * copy via get_user_pages and kmap. + */ + src_page = NULL; + + seglen = cur_iov->iov_len - iov_offset; + if (seglen > bytes) + seglen = bytes; -#ifndef CONFIG_DEBUG_VM /* * Bring in the user page that we will copy from _first_. * Otherwise there's a nasty deadlock on copying from the * same page as we're writing to, without it being marked * up-to-date. + * + * Not only is this an optimisation, but it is also required + * to check that the address is actually valid, when atomic + * usercopies are used, below. */ - fault_in_pages_readable(buf, maxlen); -#endif - + if (unlikely(fault_in_pages_readable(buf, seglen))) { + status = -EFAULT; + break; + } page = __grab_cache_page(mapping, index); if (!page) { @@ -1861,32 +1872,104 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, break; } + /* + * non-uptodate pages cannot cope with short copies, and we + * cannot take a pagefault with the destination page locked. + * So pin the source page to copy it. + */ + if (!PageUptodate(page)) { + unlock_page(page); + + src_page = alloc_page(GFP_KERNEL); + if (!src_page) { + page_cache_release(page); + status = -ENOMEM; + break; + } + + /* + * Cannot get_user_pages with a page locked for the + * same reason as we can't take a page fault with a + * page locked (as explained below). + */ + copied = filemap_copy_from_user(src_page, offset, + cur_iov, nr_segs, iov_offset, bytes); + if (unlikely(copied == 0)) { + status = -EFAULT; + page_cache_release(page); + page_cache_release(src_page); + break; + } + bytes = copied; + + lock_page(page); + /* + * Can't handle the page going uptodate here, because + * that means we would use non-atomic usercopies, which + * zero out the tail of the page, which can cause + * zeroes to become transiently visible. We could just + * use a non-zeroing copy, but the APIs aren't too + * consistent. + */ + if (unlikely(!page->mapping || PageUptodate(page))) { + unlock_page(page); + page_cache_release(page); + page_cache_release(src_page); + continue; + } + + } + status = a_ops->prepare_write(file, page, offset, offset+bytes); if (unlikely(status)) goto fs_write_aop_error; - copied = filemap_copy_from_user(page, offset, + if (!src_page) { + /* + * Must not enter the pagefault handler here, because + * we hold the page lock, so we might recursively + * deadlock on the same lock, or get an ABBA deadlock + * against a different lock, or against the mmap_sem + * (which nests outside the page lock). So increment + * preempt count, and use _atomic usercopies. + * + * The page is uptodate so we are OK to encounter a + * short copy: if unmodified parts of the page are + * marked dirty and written out to disk, it doesn't + * really matter. + */ + pagefault_disable(); + copied = filemap_copy_from_user_atomic(page, offset, cur_iov, nr_segs, iov_offset, bytes); + pagefault_enable(); + } else { + void *src, *dst; + src = kmap_atomic(src_page, KM_USER0); + dst = kmap_atomic(page, KM_USER1); + memcpy(dst + offset, src + offset, bytes); + kunmap_atomic(dst, KM_USER1); + kunmap_atomic(src, KM_USER0); + copied = bytes; + } flush_dcache_page(page); status = a_ops->commit_write(file, page, offset, offset+bytes); if (unlikely(status < 0 || status == AOP_TRUNCATED_PAGE)) goto fs_write_aop_error; - if (unlikely(copied != bytes)) { - status = -EFAULT; - goto fs_write_aop_error; - } if (unlikely(status > 0)) /* filesystem did partial write */ - copied = status; + copied = min_t(size_t, copied, status); + + unlock_page(page); + mark_page_accessed(page); + page_cache_release(page); + if (src_page) + page_cache_release(src_page); written += copied; count -= copied; pos += copied; filemap_set_next_iovec(&cur_iov, nr_segs, &iov_offset, copied); - unlock_page(page); - mark_page_accessed(page); - page_cache_release(page); balance_dirty_pages_ratelimited(mapping); cond_resched(); continue; @@ -1895,6 +1978,8 @@ fs_write_aop_error: if (status != AOP_TRUNCATED_PAGE) unlock_page(page); page_cache_release(page); + if (src_page) + page_cache_release(src_page); /* * prepare_write() may have instantiated a few blocks @@ -1907,7 +1992,6 @@ fs_write_aop_error: continue; else break; - } while (count); *ppos = pos;