From e4b2749158631e6d74bf14d2ef3804d780e2f770 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 11 May 2016 11:58:47 +0200 Subject: [PATCH 01/20] DAX: move RADIX_DAX_ definitions to dax.c These don't belong in radix-tree.c any more than PAGECACHE_TAG_* do. Let's try to maintain the idea that radix-tree simply implements an abstract data type. Acked-by: Ross Zwisler Reviewed-by: Matthew Wilcox Signed-off-by: NeilBrown Signed-off-by: Jan Kara Signed-off-by: Vishal Verma --- fs/dax.c | 9 +++++++++ include/linux/radix-tree.h | 9 --------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index 2494255c5785..7ef5aef78442 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -32,6 +32,15 @@ #include #include +#define RADIX_DAX_MASK 0xf +#define RADIX_DAX_SHIFT 4 +#define RADIX_DAX_PTE (0x4 | RADIX_TREE_EXCEPTIONAL_ENTRY) +#define RADIX_DAX_PMD (0x8 | RADIX_TREE_EXCEPTIONAL_ENTRY) +#define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_MASK) +#define RADIX_DAX_SECTOR(entry) (((unsigned long)entry >> RADIX_DAX_SHIFT)) +#define RADIX_DAX_ENTRY(sector, pmd) ((void *)((unsigned long)sector << \ + RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE))) + static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax) { struct request_queue *q = bdev->bd_queue; diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 51a97ac8bfbf..d08d6ec3bf53 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -52,15 +52,6 @@ #define RADIX_TREE_EXCEPTIONAL_ENTRY 2 #define RADIX_TREE_EXCEPTIONAL_SHIFT 2 -#define RADIX_DAX_MASK 0xf -#define RADIX_DAX_SHIFT 4 -#define RADIX_DAX_PTE (0x4 | RADIX_TREE_EXCEPTIONAL_ENTRY) -#define RADIX_DAX_PMD (0x8 | RADIX_TREE_EXCEPTIONAL_ENTRY) -#define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_MASK) -#define RADIX_DAX_SECTOR(entry) (((unsigned long)entry >> RADIX_DAX_SHIFT)) -#define RADIX_DAX_ENTRY(sector, pmd) ((void *)((unsigned long)sector << \ - RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE))) - static inline int radix_tree_is_indirect_ptr(void *ptr) { return (int)((unsigned long)ptr & RADIX_TREE_INDIRECT_PTR); From 02fbd139759feb1f331cebd858523b5d774082e6 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 11 May 2016 11:58:48 +0200 Subject: [PATCH 02/20] dax: Remove complete_unwritten argument Fault handlers currently take complete_unwritten argument to convert unwritten extents after PTEs are updated. However no filesystem uses this anymore as the code is racy. Remove the unused argument. Reviewed-by: Ross Zwisler Signed-off-by: Jan Kara Signed-off-by: Vishal Verma --- fs/block_dev.c | 4 ++-- fs/dax.c | 43 +++++++++---------------------------------- fs/ext2/file.c | 4 ++-- fs/ext4/file.c | 4 ++-- fs/xfs/xfs_file.c | 7 +++---- include/linux/dax.h | 17 +++++++---------- include/linux/fs.h | 1 - 7 files changed, 25 insertions(+), 55 deletions(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index 20a2c02b77c4..b25bb230b28a 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1746,7 +1746,7 @@ static const struct address_space_operations def_blk_aops = { */ static int blkdev_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { - return __dax_fault(vma, vmf, blkdev_get_block, NULL); + return __dax_fault(vma, vmf, blkdev_get_block); } static int blkdev_dax_pfn_mkwrite(struct vm_area_struct *vma, @@ -1758,7 +1758,7 @@ static int blkdev_dax_pfn_mkwrite(struct vm_area_struct *vma, static int blkdev_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, unsigned int flags) { - return __dax_pmd_fault(vma, addr, pmd, flags, blkdev_get_block, NULL); + return __dax_pmd_fault(vma, addr, pmd, flags, blkdev_get_block); } static const struct vm_operations_struct blkdev_dax_vm_ops = { diff --git a/fs/dax.c b/fs/dax.c index 7ef5aef78442..83e7894d86d8 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -612,19 +612,13 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, * @vma: The virtual memory area where the fault occurred * @vmf: The description of the fault * @get_block: The filesystem method used to translate file offsets to blocks - * @complete_unwritten: The filesystem method used to convert unwritten blocks - * to written so the data written to them is exposed. This is required for - * required by write faults for filesystems that will return unwritten - * extent mappings from @get_block, but it is optional for reads as - * dax_insert_mapping() will always zero unwritten blocks. If the fs does - * not support unwritten extents, the it should pass NULL. * * When a page fault occurs, filesystems may call this helper in their * fault handler for DAX files. __dax_fault() assumes the caller has done all * the necessary locking for the page fault to proceed successfully. */ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, - get_block_t get_block, dax_iodone_t complete_unwritten) + get_block_t get_block) { struct file *file = vma->vm_file; struct address_space *mapping = file->f_mapping; @@ -727,23 +721,9 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, page = NULL; } - /* - * If we successfully insert the new mapping over an unwritten extent, - * we need to ensure we convert the unwritten extent. If there is an - * error inserting the mapping, the filesystem needs to leave it as - * unwritten to prevent exposure of the stale underlying data to - * userspace, but we still need to call the completion function so - * the private resources on the mapping buffer can be released. We - * indicate what the callback should do via the uptodate variable, same - * as for normal BH based IO completions. - */ + /* Filesystem should not return unwritten buffers to us! */ + WARN_ON_ONCE(buffer_unwritten(&bh)); error = dax_insert_mapping(inode, &bh, vma, vmf); - if (buffer_unwritten(&bh)) { - if (complete_unwritten) - complete_unwritten(&bh, !error); - else - WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE)); - } out: if (error == -ENOMEM) @@ -772,7 +752,7 @@ EXPORT_SYMBOL(__dax_fault); * fault handler for DAX files. */ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, - get_block_t get_block, dax_iodone_t complete_unwritten) + get_block_t get_block) { int result; struct super_block *sb = file_inode(vma->vm_file)->i_sb; @@ -781,7 +761,7 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, sb_start_pagefault(sb); file_update_time(vma->vm_file); } - result = __dax_fault(vma, vmf, get_block, complete_unwritten); + result = __dax_fault(vma, vmf, get_block); if (vmf->flags & FAULT_FLAG_WRITE) sb_end_pagefault(sb); @@ -815,8 +795,7 @@ static void __dax_dbg(struct buffer_head *bh, unsigned long address, #define dax_pmd_dbg(bh, address, reason) __dax_dbg(bh, address, reason, "dax_pmd") int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmd, unsigned int flags, get_block_t get_block, - dax_iodone_t complete_unwritten) + pmd_t *pmd, unsigned int flags, get_block_t get_block) { struct file *file = vma->vm_file; struct address_space *mapping = file->f_mapping; @@ -875,6 +854,7 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, if (get_block(inode, block, &bh, 1) != 0) return VM_FAULT_SIGBUS; alloc = true; + WARN_ON_ONCE(buffer_unwritten(&bh)); } bdev = bh.b_bdev; @@ -1020,9 +1000,6 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, out: i_mmap_unlock_read(mapping); - if (buffer_unwritten(&bh)) - complete_unwritten(&bh, !(result & VM_FAULT_ERROR)); - return result; fallback: @@ -1042,8 +1019,7 @@ EXPORT_SYMBOL_GPL(__dax_pmd_fault); * pmd_fault handler for DAX files. */ int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmd, unsigned int flags, get_block_t get_block, - dax_iodone_t complete_unwritten) + pmd_t *pmd, unsigned int flags, get_block_t get_block) { int result; struct super_block *sb = file_inode(vma->vm_file)->i_sb; @@ -1052,8 +1028,7 @@ int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, sb_start_pagefault(sb); file_update_time(vma->vm_file); } - result = __dax_pmd_fault(vma, address, pmd, flags, get_block, - complete_unwritten); + result = __dax_pmd_fault(vma, address, pmd, flags, get_block); if (flags & FAULT_FLAG_WRITE) sb_end_pagefault(sb); diff --git a/fs/ext2/file.c b/fs/ext2/file.c index c1400b109805..868c02317b05 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -51,7 +51,7 @@ static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } down_read(&ei->dax_sem); - ret = __dax_fault(vma, vmf, ext2_get_block, NULL); + ret = __dax_fault(vma, vmf, ext2_get_block); up_read(&ei->dax_sem); if (vmf->flags & FAULT_FLAG_WRITE) @@ -72,7 +72,7 @@ static int ext2_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, } down_read(&ei->dax_sem); - ret = __dax_pmd_fault(vma, addr, pmd, flags, ext2_get_block, NULL); + ret = __dax_pmd_fault(vma, addr, pmd, flags, ext2_get_block); up_read(&ei->dax_sem); if (flags & FAULT_FLAG_WRITE) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 37e28082885a..7a6398867ff2 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -207,7 +207,7 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) if (IS_ERR(handle)) result = VM_FAULT_SIGBUS; else - result = __dax_fault(vma, vmf, ext4_dax_get_block, NULL); + result = __dax_fault(vma, vmf, ext4_dax_get_block); if (write) { if (!IS_ERR(handle)) @@ -243,7 +243,7 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, result = VM_FAULT_SIGBUS; else result = __dax_pmd_fault(vma, addr, pmd, flags, - ext4_dax_get_block, NULL); + ext4_dax_get_block); if (write) { if (!IS_ERR(handle)) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 569938a4a357..c2946f436a3a 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1558,7 +1558,7 @@ xfs_filemap_page_mkwrite( xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (IS_DAX(inode)) { - ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault, NULL); + ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault); } else { ret = block_page_mkwrite(vma, vmf, xfs_get_blocks); ret = block_page_mkwrite_return(ret); @@ -1592,7 +1592,7 @@ xfs_filemap_fault( * changes to xfs_get_blocks_direct() to map unwritten extent * ioend for conversion on read-only mappings. */ - ret = __dax_fault(vma, vmf, xfs_get_blocks_dax_fault, NULL); + ret = __dax_fault(vma, vmf, xfs_get_blocks_dax_fault); } else ret = filemap_fault(vma, vmf); xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); @@ -1629,8 +1629,7 @@ xfs_filemap_pmd_fault( } xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault, - NULL); + ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault); xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (flags & FAULT_FLAG_WRITE) diff --git a/include/linux/dax.h b/include/linux/dax.h index 636dd59ab505..7c45ac7ea1d1 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -10,10 +10,8 @@ ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *, loff_t, int dax_clear_sectors(struct block_device *bdev, sector_t _sector, long _size); int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t); int dax_truncate_page(struct inode *, loff_t from, get_block_t); -int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t, - dax_iodone_t); -int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t, - dax_iodone_t); +int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t); +int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t); #ifdef CONFIG_FS_DAX struct page *read_dax_sector(struct block_device *bdev, sector_t n); @@ -27,21 +25,20 @@ static inline struct page *read_dax_sector(struct block_device *bdev, #ifdef CONFIG_TRANSPARENT_HUGEPAGE int dax_pmd_fault(struct vm_area_struct *, unsigned long addr, pmd_t *, - unsigned int flags, get_block_t, dax_iodone_t); + unsigned int flags, get_block_t); int __dax_pmd_fault(struct vm_area_struct *, unsigned long addr, pmd_t *, - unsigned int flags, get_block_t, dax_iodone_t); + unsigned int flags, get_block_t); #else static inline int dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, - pmd_t *pmd, unsigned int flags, get_block_t gb, - dax_iodone_t di) + pmd_t *pmd, unsigned int flags, get_block_t gb) { return VM_FAULT_FALLBACK; } #define __dax_pmd_fault dax_pmd_fault #endif int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *); -#define dax_mkwrite(vma, vmf, gb, iod) dax_fault(vma, vmf, gb, iod) -#define __dax_mkwrite(vma, vmf, gb, iod) __dax_fault(vma, vmf, gb, iod) +#define dax_mkwrite(vma, vmf, gb) dax_fault(vma, vmf, gb) +#define __dax_mkwrite(vma, vmf, gb) __dax_fault(vma, vmf, gb) static inline bool vma_is_dax(struct vm_area_struct *vma) { diff --git a/include/linux/fs.h b/include/linux/fs.h index 70e61b58baaf..9f2813090d1b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -74,7 +74,6 @@ typedef int (get_block_t)(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, ssize_t bytes, void *private); -typedef void (dax_iodone_t)(struct buffer_head *bh_map, int uptodate); #define MAY_EXEC 0x00000001 #define MAY_WRITE 0x00000002 From 9b6cd5f76d60b563d75e55e432e03ed134761432 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 16 May 2016 17:17:04 +0200 Subject: [PATCH 03/20] ext2: Fix block zeroing in ext2_get_blocks() for DAX When zeroing allocated blocks for DAX, we accidentally zeroed only the first allocated block instead of all of them. So far this problem is hidden by the fact that page faults always need only a single block and DAX write code zeroes blocks again. But the zeroing in DAX code is racy and needs to be removed so fix the zeroing in ext2 to zero all allocated blocks. Reported-by: Ross Zwisler Signed-off-by: Jan Kara Signed-off-by: Vishal Verma --- fs/ext2/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 6bd58e6ff038..038d0ed5f565 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -740,7 +740,7 @@ static int ext2_get_blocks(struct inode *inode, err = dax_clear_sectors(inode->i_sb->s_bdev, le32_to_cpu(chain[depth-1].key) << (inode->i_blkbits - 9), - 1 << inode->i_blkbits); + count << inode->i_blkbits); if (err) { mutex_unlock(&ei->truncate_mutex); goto cleanup; From 86b0624e42d03a424e9571b8591d191c436f9af1 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 11 May 2016 11:58:49 +0200 Subject: [PATCH 04/20] ext2: Avoid DAX zeroing to corrupt data Currently ext2 zeroes any data blocks allocated for DAX inode however it still returns them as BH_New. Thus DAX code zeroes them again in dax_insert_mapping() which can possibly overwrite the data that has been already stored to those blocks by a racing dax_io(). Avoid marking pre-zeroed buffers as new. Reviewed-by: Ross Zwisler Signed-off-by: Jan Kara Signed-off-by: Vishal Verma --- fs/ext2/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 038d0ed5f565..9a14af3b1a69 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -745,11 +745,11 @@ static int ext2_get_blocks(struct inode *inode, mutex_unlock(&ei->truncate_mutex); goto cleanup; } - } + } else + set_buffer_new(bh_result); ext2_splice_branch(inode, iblock, partial, indirect_blks, count); mutex_unlock(&ei->truncate_mutex); - set_buffer_new(bh_result); got_it: map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key)); if (count > blocks_to_boundary) From 2b10945c532c165a824f541df76a15ed0be04d78 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 11 May 2016 11:58:50 +0200 Subject: [PATCH 05/20] dax: Remove dead zeroing code from fault handlers Now that all filesystems zero out blocks allocated for a fault handler, we can just remove the zeroing from the handler itself. Also add checks that no filesystem returns to us unwritten or new buffer. Reviewed-by: Ross Zwisler Signed-off-by: Jan Kara Signed-off-by: Vishal Verma --- fs/dax.c | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index 83e7894d86d8..ccb8bc399d78 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -587,11 +587,6 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, error = PTR_ERR(dax.addr); goto out; } - - if (buffer_unwritten(bh) || buffer_new(bh)) { - clear_pmem(dax.addr, PAGE_SIZE); - wmb_pmem(); - } dax_unmap_atomic(bdev, &dax); error = dax_radix_entry(mapping, vmf->pgoff, dax.sector, false, @@ -722,7 +717,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, } /* Filesystem should not return unwritten buffers to us! */ - WARN_ON_ONCE(buffer_unwritten(&bh)); + WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh)); error = dax_insert_mapping(inode, &bh, vma, vmf); out: @@ -854,7 +849,7 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, if (get_block(inode, block, &bh, 1) != 0) return VM_FAULT_SIGBUS; alloc = true; - WARN_ON_ONCE(buffer_unwritten(&bh)); + WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh)); } bdev = bh.b_bdev; @@ -953,14 +948,6 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, dax_pmd_dbg(&bh, address, "pfn not in memmap"); goto fallback; } - - if (buffer_unwritten(&bh) || buffer_new(&bh)) { - clear_pmem(dax.addr, PMD_SIZE); - wmb_pmem(); - count_vm_event(PGMAJFAULT); - mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); - result |= VM_FAULT_MAJOR; - } dax_unmap_atomic(bdev, &dax); /* From 069c77bc9eaee70fa9ecbd942372b1693b8cdeb0 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 11 May 2016 11:58:51 +0200 Subject: [PATCH 06/20] dax: Remove zeroing from dax_io() All the filesystems are now zeroing blocks themselves for DAX IO to avoid races between dax_io() and dax_fault(). Remove the zeroing code from dax_io() and add warning to catch the case when somebody unexpectedly returns new or unwritten buffer. Reviewed-by: Ross Zwisler Signed-off-by: Jan Kara Signed-off-by: Vishal Verma --- fs/dax.c | 28 ++++++++++------------------ 1 file changed, 10 insertions(+), 18 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index ccb8bc399d78..7c0036dd1570 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -119,18 +119,6 @@ int dax_clear_sectors(struct block_device *bdev, sector_t _sector, long _size) } EXPORT_SYMBOL_GPL(dax_clear_sectors); -/* the clear_pmem() calls are ordered by a wmb_pmem() in the caller */ -static void dax_new_buf(void __pmem *addr, unsigned size, unsigned first, - loff_t pos, loff_t end) -{ - loff_t final = end - pos + first; /* The final byte of the buffer */ - - if (first > 0) - clear_pmem(addr, first); - if (final < size) - clear_pmem(addr + final, size - final); -} - static bool buffer_written(struct buffer_head *bh) { return buffer_mapped(bh) && !buffer_unwritten(bh); @@ -169,6 +157,9 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, struct blk_dax_ctl dax = { .addr = (void __pmem *) ERR_PTR(-EIO), }; + unsigned blkbits = inode->i_blkbits; + sector_t file_blks = (i_size_read(inode) + (1 << blkbits) - 1) + >> blkbits; if (rw == READ) end = min(end, i_size_read(inode)); @@ -176,7 +167,6 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, while (pos < end) { size_t len; if (pos == max) { - unsigned blkbits = inode->i_blkbits; long page = pos >> PAGE_SHIFT; sector_t block = page << (PAGE_SHIFT - blkbits); unsigned first = pos - (block << blkbits); @@ -192,6 +182,13 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, bh->b_size = 1 << blkbits; bh_max = pos - first + bh->b_size; bdev = bh->b_bdev; + /* + * We allow uninitialized buffers for writes + * beyond EOF as those cannot race with faults + */ + WARN_ON_ONCE( + (buffer_new(bh) && block < file_blks) || + (rw == WRITE && buffer_unwritten(bh))); } else { unsigned done = bh->b_size - (bh_max - (pos - first)); @@ -211,11 +208,6 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, rc = map_len; break; } - if (buffer_unwritten(bh) || buffer_new(bh)) { - dax_new_buf(dax.addr, map_len, first, - pos, end); - need_wmb = true; - } dax.addr += first; size = map_len - first; } From c3d98e39d5b37320b15f227686575d58f676e6ef Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 11 May 2016 11:58:52 +0200 Subject: [PATCH 07/20] dax: Remove pointless writeback from dax_do_io() dax_do_io() is calling filemap_write_and_wait() if DIO_LOCKING flags is set. Presumably this was copied over from direct IO code. However DAX inodes have no pagecache pages to write so the call is pointless. Remove it. Reviewed-by: Ross Zwisler Signed-off-by: Jan Kara Signed-off-by: Vishal Verma --- fs/dax.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index 7c0036dd1570..237581441bc1 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -268,15 +268,8 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode, memset(&bh, 0, sizeof(bh)); bh.b_bdev = inode->i_sb->s_bdev; - if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) { - struct address_space *mapping = inode->i_mapping; + if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) inode_lock(inode); - retval = filemap_write_and_wait_range(mapping, pos, end - 1); - if (retval) { - inode_unlock(inode); - goto out; - } - } /* Protects against truncate */ if (!(flags & DIO_SKIP_DIO_COUNT)) @@ -297,7 +290,6 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode, if (!(flags & DIO_SKIP_DIO_COUNT)) inode_dio_end(inode); - out: return retval; } EXPORT_SYMBOL_GPL(dax_do_io); From 7795bec89ebf927ea3ad9ed5f396c227e5c73271 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 11 May 2016 11:58:53 +0200 Subject: [PATCH 08/20] dax: Remove redundant inode size checks Callers of dax fault handlers must make sure these calls cannot race with truncate. Thus it is enough to check inode size when entering the function and we don't have to recheck it again later in the handler. Note that inode size itself can be decreased while the fault handler runs but filesystem locking prevents against any radix tree or block mapping information changes resulting from the truncate and that is what we really care about. Reviewed-by: Ross Zwisler Signed-off-by: Jan Kara Signed-off-by: Vishal Verma --- fs/dax.c | 60 +------------------------------------------------------- 1 file changed, 1 insertion(+), 59 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index 237581441bc1..9bc6624251b4 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -305,20 +305,11 @@ EXPORT_SYMBOL_GPL(dax_do_io); static int dax_load_hole(struct address_space *mapping, struct page *page, struct vm_fault *vmf) { - unsigned long size; - struct inode *inode = mapping->host; if (!page) page = find_or_create_page(mapping, vmf->pgoff, GFP_KERNEL | __GFP_ZERO); if (!page) return VM_FAULT_OOM; - /* Recheck i_size under page lock to avoid truncate race */ - size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (vmf->pgoff >= size) { - unlock_page(page); - put_page(page); - return VM_FAULT_SIGBUS; - } vmf->page = page; return VM_FAULT_LOCKED; @@ -549,24 +540,10 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, .sector = to_sector(bh, inode), .size = bh->b_size, }; - pgoff_t size; int error; i_mmap_lock_read(mapping); - /* - * Check truncate didn't happen while we were allocating a block. - * If it did, this block may or may not be still allocated to the - * file. We can't tell the filesystem to free it because we can't - * take i_mutex here. In the worst case, the file still has blocks - * allocated past the end of the file. - */ - size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (unlikely(vmf->pgoff >= size)) { - error = -EIO; - goto out; - } - if (dax_map_atomic(bdev, &dax) < 0) { error = PTR_ERR(dax.addr); goto out; @@ -632,15 +609,6 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, put_page(page); goto repeat; } - size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (unlikely(vmf->pgoff >= size)) { - /* - * We have a struct page covering a hole in the file - * from a read fault and we've raced with a truncate - */ - error = -EIO; - goto unlock_page; - } } error = get_block(inode, block, &bh, 0); @@ -673,17 +641,8 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, if (error) goto unlock_page; vmf->page = page; - if (!page) { + if (!page) i_mmap_lock_read(mapping); - /* Check we didn't race with truncate */ - size = (i_size_read(inode) + PAGE_SIZE - 1) >> - PAGE_SHIFT; - if (vmf->pgoff >= size) { - i_mmap_unlock_read(mapping); - error = -EIO; - goto out; - } - } return VM_FAULT_LOCKED; } @@ -861,23 +820,6 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, i_mmap_lock_read(mapping); - /* - * If a truncate happened while we were allocating blocks, we may - * leave blocks allocated to the file that are beyond EOF. We can't - * take i_mutex here, so just leave them hanging; they'll be freed - * when the file is deleted. - */ - size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (pgoff >= size) { - result = VM_FAULT_SIGBUS; - goto out; - } - if ((pgoff | PG_PMD_COLOUR) >= size) { - dax_pmd_dbg(&bh, address, - "offset + huge page size > file size"); - goto fallback; - } - if (!write && !buffer_mapped(&bh) && buffer_uptodate(&bh)) { spinlock_t *ptl; pmd_t entry; From 2af3a8159cd204fc8437ed2f75863f0fb930f0d0 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 10 May 2016 10:23:52 -0600 Subject: [PATCH 09/20] block: Add vfs_msg() interface In preparation of moving DAX capability checks to the block layer from filesystem code, add a VFS message interface that aligns with filesystem's message format. For instance, a vfs_msg() message followed by XFS messages in case of a dax mount error may look like: VFS (pmem0p1): error: unaligned partition for dax XFS (pmem0p1): DAX unsupported by block device. Turning off DAX. XFS (pmem0p1): Mounting V5 Filesystem : vfs_msg() is largely based on ext4_msg(). Signed-off-by: Toshi Kani Reviewed-by: Christoph Hellwig Cc: Alexander Viro Cc: Jens Axboe Cc: "Theodore Ts'o" Cc: Andreas Dilger Cc: Jan Kara Cc: Dave Chinner Cc: Dan Williams Cc: Ross Zwisler Cc: Christoph Hellwig Cc: Boaz Harrosh Signed-off-by: Vishal Verma --- fs/block_dev.c | 12 ++++++++++++ include/linux/blkdev.h | 11 +++++++++++ 2 files changed, 23 insertions(+) diff --git a/fs/block_dev.c b/fs/block_dev.c index b25bb230b28a..91e0ec0233c0 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -50,6 +50,18 @@ struct block_device *I_BDEV(struct inode *inode) } EXPORT_SYMBOL(I_BDEV); +void __vfs_msg(struct super_block *sb, const char *prefix, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + printk_ratelimited("%sVFS (%s): %pV\n", prefix, sb->s_id, &vaf); + va_end(args); +} + static void bdev_write_inode(struct block_device *bdev) { struct inode *inode = bdev->bd_inode; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 669e419d6234..78c48ab22f46 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -767,6 +767,17 @@ static inline void rq_flush_dcache_pages(struct request *rq) } #endif +#ifdef CONFIG_PRINTK +#define vfs_msg(sb, level, fmt, ...) \ + __vfs_msg(sb, level, fmt, ##__VA_ARGS__) +#else +#define vfs_msg(sb, level, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __vfs_msg(sb, "", " "); \ +} while (0) +#endif + extern int blk_register_queue(struct gendisk *disk); extern void blk_unregister_queue(struct gendisk *disk); extern blk_qc_t generic_make_request(struct bio *bio); From 2d96afc8f70ef86c66a0b5d80c24a27d6dd13df3 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 10 May 2016 10:23:53 -0600 Subject: [PATCH 10/20] block: Add bdev_dax_supported() for dax mount checks DAX imposes additional requirements to a device. Add bdev_dax_supported() which performs all the precondition checks necessary for filesystem to mount the device with dax option. Also add a new check to verify if a partition is aligned by 4KB. When a partition is unaligned, any dax read/write access fails, except for metadata update. Signed-off-by: Toshi Kani Reviewed-by: Christoph Hellwig Cc: Alexander Viro Cc: Jens Axboe Cc: "Theodore Ts'o" Cc: Andreas Dilger Cc: Jan Kara Cc: Dave Chinner Cc: Dan Williams Cc: Ross Zwisler Cc: Christoph Hellwig Cc: Boaz Harrosh Signed-off-by: Vishal Verma --- fs/block_dev.c | 45 ++++++++++++++++++++++++++++++++++++++++++ include/linux/blkdev.h | 1 + 2 files changed, 46 insertions(+) diff --git a/fs/block_dev.c b/fs/block_dev.c index 91e0ec0233c0..518cde62c01c 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -509,6 +509,51 @@ long bdev_direct_access(struct block_device *bdev, struct blk_dax_ctl *dax) } EXPORT_SYMBOL_GPL(bdev_direct_access); +/** + * bdev_dax_supported() - Check if the device supports dax for filesystem + * @sb: The superblock of the device + * @blocksize: The block size of the device + * + * This is a library function for filesystems to check if the block device + * can be mounted with dax option. + * + * Return: negative errno if unsupported, 0 if supported. + */ +int bdev_dax_supported(struct super_block *sb, int blocksize) +{ + struct blk_dax_ctl dax = { + .sector = 0, + .size = PAGE_SIZE, + }; + int err; + + if (blocksize != PAGE_SIZE) { + vfs_msg(sb, KERN_ERR, "error: unsupported blocksize for dax"); + return -EINVAL; + } + + err = bdev_direct_access(sb->s_bdev, &dax); + if (err < 0) { + switch (err) { + case -EOPNOTSUPP: + vfs_msg(sb, KERN_ERR, + "error: device does not support dax"); + break; + case -EINVAL: + vfs_msg(sb, KERN_ERR, + "error: unaligned partition for dax"); + break; + default: + vfs_msg(sb, KERN_ERR, + "error: dax access failed (%d)", err); + } + return err; + } + + return 0; +} +EXPORT_SYMBOL_GPL(bdev_dax_supported); + /* * pseudo-fs */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 78c48ab22f46..71231a55debd 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1688,6 +1688,7 @@ extern int bdev_read_page(struct block_device *, sector_t, struct page *); extern int bdev_write_page(struct block_device *, sector_t, struct page *, struct writeback_control *); extern long bdev_direct_access(struct block_device *, struct blk_dax_ctl *); +extern int bdev_dax_supported(struct super_block *, int); #else /* CONFIG_BLOCK */ struct block_device; From 87eefeb4e80b1526be83e670a08d059a003b7d0e Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 10 May 2016 10:23:54 -0600 Subject: [PATCH 11/20] ext4: Add alignment check for DAX mount When a partition is not aligned by 4KB, mount -o dax succeeds, but any read/write access to the filesystem fails, except for metadata update. Call bdev_dax_supported() to perform proper precondition checks which includes this partition alignment check. Reported-by: Micah Parrish Signed-off-by: Toshi Kani Reviewed-by: Jan Kara Cc: "Theodore Ts'o" Cc: Andreas Dilger Cc: Jan Kara Cc: Dan Williams Cc: Ross Zwisler Cc: Christoph Hellwig Cc: Boaz Harrosh Signed-off-by: Vishal Verma --- fs/ext4/super.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 20c5d52253b4..3822a5aedc61 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3417,16 +3417,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } if (sbi->s_mount_opt & EXT4_MOUNT_DAX) { - if (blocksize != PAGE_SIZE) { - ext4_msg(sb, KERN_ERR, - "error: unsupported blocksize for dax"); + err = bdev_dax_supported(sb, blocksize); + if (err) goto failed_mount; - } - if (!sb->s_bdev->bd_disk->fops->direct_access) { - ext4_msg(sb, KERN_ERR, - "error: device does not support dax"); - goto failed_mount; - } } if (ext4_has_feature_encrypt(sb) && es->s_encryption_level) { From 284854be2b85e967d17c098a1d4c176b5cd37eb3 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 10 May 2016 10:23:55 -0600 Subject: [PATCH 12/20] ext2: Add alignment check for DAX mount When a partition is not aligned by 4KB, mount -o dax succeeds, but any read/write access to the filesystem fails, except for metadata update. Call bdev_dax_supported() to perform proper precondition checks which includes this partition alignment check. Signed-off-by: Toshi Kani Reviewed-by: Jan Kara Cc: Jan Kara Cc: Dan Williams Cc: Ross Zwisler Cc: Christoph Hellwig Cc: Boaz Harrosh Signed-off-by: Vishal Verma --- fs/ext2/super.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/fs/ext2/super.c b/fs/ext2/super.c index b78caf25f746..1d9379568aa8 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -922,16 +922,9 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) blocksize = BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size); if (sbi->s_mount_opt & EXT2_MOUNT_DAX) { - if (blocksize != PAGE_SIZE) { - ext2_msg(sb, KERN_ERR, - "error: unsupported blocksize for dax"); + err = bdev_dax_supported(sb, blocksize); + if (err) goto failed_mount; - } - if (!sb->s_bdev->bd_disk->fops->direct_access) { - ext2_msg(sb, KERN_ERR, - "error: device does not support dax"); - goto failed_mount; - } } /* If the blocksize doesn't match, re-read the thing.. */ From 1e937cddd1ff4b823fc9d148ebabd443f105211e Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 10 May 2016 10:23:56 -0600 Subject: [PATCH 13/20] xfs: Add alignment check for DAX mount When a partition is not aligned by 4KB, mount -o dax succeeds, but any read/write access to the filesystem fails, except for metadata update. Call bdev_dax_supported() to perform proper precondition checks which includes this partition alignment check. Signed-off-by: Toshi Kani Reviewed-by: Christoph Hellwig Cc: Dave Chinner Cc: Dan Williams Cc: Ross Zwisler Cc: Christoph Hellwig Cc: Boaz Harrosh Signed-off-by: Vishal Verma --- fs/xfs/xfs_super.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 187e14b696c2..39f4e6ab3d36 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1558,14 +1558,12 @@ xfs_fs_fill_super( if (mp->m_flags & XFS_MOUNT_DAX) { xfs_warn(mp, - "DAX enabled. Warning: EXPERIMENTAL, use at your own risk"); - if (sb->s_blocksize != PAGE_SIZE) { + "DAX enabled. Warning: EXPERIMENTAL, use at your own risk"); + + error = bdev_dax_supported(sb, sb->s_blocksize); + if (error) { xfs_alert(mp, - "Filesystem block size invalid for DAX Turning DAX off."); - mp->m_flags &= ~XFS_MOUNT_DAX; - } else if (!sb->s_bdev->bd_disk->fops->direct_access) { - xfs_alert(mp, - "Block device does not support DAX Turning DAX off."); + "DAX unsupported by block device. Turning off DAX."); mp->m_flags &= ~XFS_MOUNT_DAX; } } From a8078b1fc616da6112eb95f0063cd34531d4ccf0 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 10 May 2016 10:23:57 -0600 Subject: [PATCH 14/20] block: Update blkdev_dax_capable() for consistency blkdev_dax_capable() is similar to bdev_dax_supported(), but needs to remain as a separate interface for checking dax capability of a raw block device. Rename and relocate blkdev_dax_capable() to keep them maintained consistently, and call bdev_direct_access() for the dax capability check. There is no change in the behavior. Link: https://lkml.org/lkml/2016/5/9/950 Signed-off-by: Toshi Kani Reviewed-by: Jan Kara Cc: Alexander Viro Cc: Jens Axboe Cc: Andreas Dilger Cc: Jan Kara Cc: Dave Chinner Cc: Dan Williams Cc: Ross Zwisler Cc: Christoph Hellwig Cc: Boaz Harrosh Signed-off-by: Vishal Verma --- block/ioctl.c | 30 ------------------------------ fs/block_dev.c | 39 +++++++++++++++++++++++++++++++++++++-- include/linux/blkdev.h | 1 + include/linux/fs.h | 8 -------- 4 files changed, 38 insertions(+), 40 deletions(-) diff --git a/block/ioctl.c b/block/ioctl.c index 4ff1f92f89ca..7eeda072dc70 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -4,7 +4,6 @@ #include #include #include -#include #include #include #include @@ -407,35 +406,6 @@ static inline int is_unrecognized_ioctl(int ret) ret == -ENOIOCTLCMD; } -#ifdef CONFIG_FS_DAX -bool blkdev_dax_capable(struct block_device *bdev) -{ - struct gendisk *disk = bdev->bd_disk; - - if (!disk->fops->direct_access) - return false; - - /* - * If the partition is not aligned on a page boundary, we can't - * do dax I/O to it. - */ - if ((bdev->bd_part->start_sect % (PAGE_SIZE / 512)) - || (bdev->bd_part->nr_sects % (PAGE_SIZE / 512))) - return false; - - /* - * If the device has known bad blocks, force all I/O through the - * driver / page cache. - * - * TODO: support finer grained dax error handling - */ - if (disk->bb && disk->bb->count) - return false; - - return true; -} -#endif - static int blkdev_flushbuf(struct block_device *bdev, fmode_t mode, unsigned cmd, unsigned long arg) { diff --git a/fs/block_dev.c b/fs/block_dev.c index 518cde62c01c..8477d4501b1e 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include "internal.h" @@ -554,6 +555,40 @@ int bdev_dax_supported(struct super_block *sb, int blocksize) } EXPORT_SYMBOL_GPL(bdev_dax_supported); +/** + * bdev_dax_capable() - Return if the raw device is capable for dax + * @bdev: The device for raw block device access + */ +bool bdev_dax_capable(struct block_device *bdev) +{ + struct gendisk *disk = bdev->bd_disk; + struct blk_dax_ctl dax = { + .size = PAGE_SIZE, + }; + + if (!IS_ENABLED(CONFIG_FS_DAX)) + return false; + + dax.sector = 0; + if (bdev_direct_access(bdev, &dax) < 0) + return false; + + dax.sector = bdev->bd_part->nr_sects - (PAGE_SIZE / 512); + if (bdev_direct_access(bdev, &dax) < 0) + return false; + + /* + * If the device has known bad blocks, force all I/O through the + * driver / page cache. + * + * TODO: support finer grained dax error handling + */ + if (disk->bb && disk->bb->count) + return false; + + return true; +} + /* * pseudo-fs */ @@ -1295,7 +1330,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) if (!ret) { bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); - if (!blkdev_dax_capable(bdev)) + if (!bdev_dax_capable(bdev)) bdev->bd_inode->i_flags &= ~S_DAX; } @@ -1332,7 +1367,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) goto out_clear; } bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9); - if (!blkdev_dax_capable(bdev)) + if (!bdev_dax_capable(bdev)) bdev->bd_inode->i_flags &= ~S_DAX; } } else { diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 71231a55debd..27cbefe8c985 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1689,6 +1689,7 @@ extern int bdev_write_page(struct block_device *, sector_t, struct page *, struct writeback_control *); extern long bdev_direct_access(struct block_device *, struct blk_dax_ctl *); extern int bdev_dax_supported(struct super_block *, int); +extern bool bdev_dax_capable(struct block_device *); #else /* CONFIG_BLOCK */ struct block_device; diff --git a/include/linux/fs.h b/include/linux/fs.h index 9f2813090d1b..17f934fcf564 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2319,14 +2319,6 @@ extern struct super_block *freeze_bdev(struct block_device *); extern void emergency_thaw_all(void); extern int thaw_bdev(struct block_device *bdev, struct super_block *sb); extern int fsync_bdev(struct block_device *); -#ifdef CONFIG_FS_DAX -extern bool blkdev_dax_capable(struct block_device *bdev); -#else -static inline bool blkdev_dax_capable(struct block_device *bdev) -{ - return false; -} -#endif extern struct super_block *blockdev_superblock; From 8b3db9798c689ba2dc509f6ca1a8c7e1c1a3e62a Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 24 Feb 2016 14:02:06 -0800 Subject: [PATCH 15/20] dax: fallback from pmd to pte on error In preparation for consulting a badblocks list in pmem_direct_access(), teach dax_pmd_fault() to fallback rather than fail immediately upon encountering an error. The thought being that reducing the span of the dax request may avoid the error region. Reviewed-by: Jeff Moyer Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Dan Williams Signed-off-by: Vishal Verma --- fs/dax.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index 9bc6624251b4..d602410d8e52 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -855,8 +855,8 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, long length = dax_map_atomic(bdev, &dax); if (length < 0) { - result = VM_FAULT_SIGBUS; - goto out; + dax_pmd_dbg(&bh, address, "dax-error fallback"); + goto fallback; } if (length < PMD_SIZE) { dax_pmd_dbg(&bh, address, "dax-length too small"); From 0a70bd43053331d99881211e1d09f32de531432f Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 24 Feb 2016 14:02:11 -0800 Subject: [PATCH 16/20] dax: enable dax in the presence of known media errors (badblocks) 1/ If a mapping overlaps a bad sector fail the request. 2/ Do not opportunistically report more dax-capable capacity than is requested when errors present. Reviewed-by: Jeff Moyer Reviewed-by: Christoph Hellwig Signed-off-by: Dan Williams [vishal: fix a conflict with system RAM collision patches] [vishal: add a 'size' parameter to ->direct_access] [vishal: fix a conflict with DAX alignment check patches] Signed-off-by: Vishal Verma --- arch/powerpc/sysdev/axonram.c | 2 +- drivers/block/brd.c | 2 +- drivers/nvdimm/pmem.c | 10 +++++++++- drivers/s390/block/dcssblk.c | 4 ++-- fs/block_dev.c | 13 +------------ include/linux/blkdev.h | 2 +- 6 files changed, 15 insertions(+), 18 deletions(-) diff --git a/arch/powerpc/sysdev/axonram.c b/arch/powerpc/sysdev/axonram.c index 0d112b94d91d..ff75d70f7285 100644 --- a/arch/powerpc/sysdev/axonram.c +++ b/arch/powerpc/sysdev/axonram.c @@ -143,7 +143,7 @@ axon_ram_make_request(struct request_queue *queue, struct bio *bio) */ static long axon_ram_direct_access(struct block_device *device, sector_t sector, - void __pmem **kaddr, pfn_t *pfn) + void __pmem **kaddr, pfn_t *pfn, long size) { struct axon_ram_bank *bank = device->bd_disk->private_data; loff_t offset = (loff_t)sector << AXON_RAM_SECTOR_SHIFT; diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 51a071e32221..c04bd9bc39fd 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -381,7 +381,7 @@ static int brd_rw_page(struct block_device *bdev, sector_t sector, #ifdef CONFIG_BLK_DEV_RAM_DAX static long brd_direct_access(struct block_device *bdev, sector_t sector, - void __pmem **kaddr, pfn_t *pfn) + void __pmem **kaddr, pfn_t *pfn, long size) { struct brd_device *brd = bdev->bd_disk->private_data; struct page *page; diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index f798899338ed..c447579bd853 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -182,14 +182,22 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector, } static long pmem_direct_access(struct block_device *bdev, sector_t sector, - void __pmem **kaddr, pfn_t *pfn) + void __pmem **kaddr, pfn_t *pfn, long size) { struct pmem_device *pmem = bdev->bd_disk->private_data; resource_size_t offset = sector * 512 + pmem->data_offset; + if (unlikely(is_bad_pmem(&pmem->bb, sector, size))) + return -EIO; *kaddr = pmem->virt_addr + offset; *pfn = phys_to_pfn_t(pmem->phys_addr + offset, pmem->pfn_flags); + /* + * If badblocks are present, limit known good range to the + * requested range. + */ + if (unlikely(pmem->bb.count)) + return size; return pmem->size - pmem->pfn_pad - offset; } diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index 1bce9cf51b1e..6ac33984bc0f 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -31,7 +31,7 @@ static void dcssblk_release(struct gendisk *disk, fmode_t mode); static blk_qc_t dcssblk_make_request(struct request_queue *q, struct bio *bio); static long dcssblk_direct_access(struct block_device *bdev, sector_t secnum, - void __pmem **kaddr, pfn_t *pfn); + void __pmem **kaddr, pfn_t *pfn, long size); static char dcssblk_segments[DCSSBLK_PARM_LEN] = "\0"; @@ -883,7 +883,7 @@ fail: static long dcssblk_direct_access (struct block_device *bdev, sector_t secnum, - void __pmem **kaddr, pfn_t *pfn) + void __pmem **kaddr, pfn_t *pfn, long size) { struct dcssblk_dev_info *dev_info; unsigned long offset, dev_sz; diff --git a/fs/block_dev.c b/fs/block_dev.c index 8477d4501b1e..45839b27972c 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -29,7 +29,6 @@ #include #include #include -#include #include #include "internal.h" @@ -501,7 +500,7 @@ long bdev_direct_access(struct block_device *bdev, struct blk_dax_ctl *dax) sector += get_start_sect(bdev); if (sector % (PAGE_SIZE / 512)) return -EINVAL; - avail = ops->direct_access(bdev, sector, &dax->addr, &dax->pfn); + avail = ops->direct_access(bdev, sector, &dax->addr, &dax->pfn, size); if (!avail) return -ERANGE; if (avail > 0 && avail & ~PAGE_MASK) @@ -561,7 +560,6 @@ EXPORT_SYMBOL_GPL(bdev_dax_supported); */ bool bdev_dax_capable(struct block_device *bdev) { - struct gendisk *disk = bdev->bd_disk; struct blk_dax_ctl dax = { .size = PAGE_SIZE, }; @@ -577,15 +575,6 @@ bool bdev_dax_capable(struct block_device *bdev) if (bdev_direct_access(bdev, &dax) < 0) return false; - /* - * If the device has known bad blocks, force all I/O through the - * driver / page cache. - * - * TODO: support finer grained dax error handling - */ - if (disk->bb && disk->bb->count) - return false; - return true; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 27cbefe8c985..cf7c13c2c38d 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1668,7 +1668,7 @@ struct block_device_operations { int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); long (*direct_access)(struct block_device *, sector_t, void __pmem **, - pfn_t *); + pfn_t *, long); unsigned int (*check_events) (struct gendisk *disk, unsigned int clearing); /* ->media_changed() is DEPRECATED, use ->check_events() instead */ From 3dc29161070ab14d065554c0ad58988ab77a7bfd Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 15 Mar 2016 11:20:41 -0600 Subject: [PATCH 17/20] dax: use sb_issue_zerout instead of calling dax_clear_sectors dax_clear_sectors() cannot handle poisoned blocks. These must be zeroed using the BIO interface instead. Convert ext2 and XFS to use only sb_issue_zerout(). Reviewed-by: Jeff Moyer Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Matthew Wilcox [vishal: Also remove the dax_clear_sectors function entirely] Signed-off-by: Vishal Verma --- fs/dax.c | 32 -------------------------------- fs/ext2/inode.c | 8 ++++---- fs/xfs/xfs_bmap_util.c | 15 ++++----------- include/linux/dax.h | 1 - 4 files changed, 8 insertions(+), 48 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index d602410d8e52..0abbbb62981e 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -87,38 +87,6 @@ struct page *read_dax_sector(struct block_device *bdev, sector_t n) return page; } -/* - * dax_clear_sectors() is called from within transaction context from XFS, - * and hence this means the stack from this point must follow GFP_NOFS - * semantics for all operations. - */ -int dax_clear_sectors(struct block_device *bdev, sector_t _sector, long _size) -{ - struct blk_dax_ctl dax = { - .sector = _sector, - .size = _size, - }; - - might_sleep(); - do { - long count, sz; - - count = dax_map_atomic(bdev, &dax); - if (count < 0) - return count; - sz = min_t(long, count, SZ_128K); - clear_pmem(dax.addr, sz); - dax.size -= sz; - dax.sector += sz / 512; - dax_unmap_atomic(bdev, &dax); - cond_resched(); - } while (dax.size); - - wmb_pmem(); - return 0; -} -EXPORT_SYMBOL_GPL(dax_clear_sectors); - static bool buffer_written(struct buffer_head *bh) { return buffer_mapped(bh) && !buffer_unwritten(bh); diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 9a14af3b1a69..17cbd6b696f2 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -737,10 +738,9 @@ static int ext2_get_blocks(struct inode *inode, * so that it's not found by another thread before it's * initialised */ - err = dax_clear_sectors(inode->i_sb->s_bdev, - le32_to_cpu(chain[depth-1].key) << - (inode->i_blkbits - 9), - count << inode->i_blkbits); + err = sb_issue_zeroout(inode->i_sb, + le32_to_cpu(chain[depth-1].key), count, + GFP_NOFS); if (err) { mutex_unlock(&ei->truncate_mutex); goto cleanup; diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 3b6309865c65..930ac6a17ce3 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -72,18 +72,11 @@ xfs_zero_extent( struct xfs_mount *mp = ip->i_mount; xfs_daddr_t sector = xfs_fsb_to_db(ip, start_fsb); sector_t block = XFS_BB_TO_FSBT(mp, sector); - ssize_t size = XFS_FSB_TO_B(mp, count_fsb); - - if (IS_DAX(VFS_I(ip))) - return dax_clear_sectors(xfs_find_bdev_for_inode(VFS_I(ip)), - sector, size); - - /* - * let the block layer decide on the fastest method of - * implementing the zeroing. - */ - return sb_issue_zeroout(mp->m_super, block, count_fsb, GFP_NOFS); + return blkdev_issue_zeroout(xfs_find_bdev_for_inode(VFS_I(ip)), + block << (mp->m_super->s_blocksize_bits - 9), + count_fsb << (mp->m_super->s_blocksize_bits - 9), + GFP_NOFS, true); } /* diff --git a/include/linux/dax.h b/include/linux/dax.h index 7c45ac7ea1d1..7f853ffaa987 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -7,7 +7,6 @@ ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *, loff_t, get_block_t, dio_iodone_t, int flags); -int dax_clear_sectors(struct block_device *bdev, sector_t _sector, long _size); int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t); int dax_truncate_page(struct inode *, loff_t from, get_block_t); int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t); From 679c8bd3b29428e736eabb7fc66a978f312f0c86 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 9 May 2016 10:47:04 +0200 Subject: [PATCH 18/20] dax: export a low-level __dax_zero_page_range helper This allows XFS to perform zeroing using the iomap infrastructure and avoid buffer heads. Reviewed-by: Jan Kara Signed-off-by: Christoph Hellwig [vishal: fix conflicts with dax-error-handling] Signed-off-by: Vishal Verma --- fs/dax.c | 35 ++++++++++++++++++++--------------- include/linux/dax.h | 7 +++++++ 2 files changed, 27 insertions(+), 15 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index 0abbbb62981e..651d4b18ac29 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -947,6 +947,23 @@ int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) } EXPORT_SYMBOL_GPL(dax_pfn_mkwrite); +int __dax_zero_page_range(struct block_device *bdev, sector_t sector, + unsigned int offset, unsigned int length) +{ + struct blk_dax_ctl dax = { + .sector = sector, + .size = PAGE_SIZE, + }; + + if (dax_map_atomic(bdev, &dax) < 0) + return PTR_ERR(dax.addr); + clear_pmem(dax.addr + offset, length); + wmb_pmem(); + dax_unmap_atomic(bdev, &dax); + return 0; +} +EXPORT_SYMBOL_GPL(__dax_zero_page_range); + /** * dax_zero_page_range - zero a range within a page of a DAX file * @inode: The file being truncated @@ -982,23 +999,11 @@ int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length, bh.b_bdev = inode->i_sb->s_bdev; bh.b_size = PAGE_SIZE; err = get_block(inode, index, &bh, 0); - if (err < 0) + if (err < 0 || !buffer_written(&bh)) return err; - if (buffer_written(&bh)) { - struct block_device *bdev = bh.b_bdev; - struct blk_dax_ctl dax = { - .sector = to_sector(&bh, inode), - .size = PAGE_SIZE, - }; - if (dax_map_atomic(bdev, &dax) < 0) - return PTR_ERR(dax.addr); - clear_pmem(dax.addr + offset, length); - wmb_pmem(); - dax_unmap_atomic(bdev, &dax); - } - - return 0; + return __dax_zero_page_range(bh.b_bdev, to_sector(&bh, inode), + offset, length); } EXPORT_SYMBOL_GPL(dax_zero_page_range); diff --git a/include/linux/dax.h b/include/linux/dax.h index 7f853ffaa987..90fbc99e5313 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -14,12 +14,19 @@ int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t); #ifdef CONFIG_FS_DAX struct page *read_dax_sector(struct block_device *bdev, sector_t n); +int __dax_zero_page_range(struct block_device *bdev, sector_t sector, + unsigned int offset, unsigned int length); #else static inline struct page *read_dax_sector(struct block_device *bdev, sector_t n) { return ERR_PTR(-ENXIO); } +static inline int __dax_zero_page_range(struct block_device *bdev, + sector_t sector, unsigned int offset, unsigned int length) +{ + return -ENXIO; +} #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE From 4b0228fa1d753f77fe0e6cf4c41398ec77dfbd2a Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Thu, 21 Apr 2016 15:13:46 -0400 Subject: [PATCH 19/20] dax: for truncate/hole-punch, do zeroing through the driver if possible In the truncate or hole-punch path in dax, we clear out sub-page ranges. If these sub-page ranges are sector aligned and sized, we can do the zeroing through the driver instead so that error-clearing is handled automatically. For sub-sector ranges, we still have to rely on clear_pmem and have the possibility of tripping over errors. Cc: Dan Williams Cc: Ross Zwisler Cc: Jeff Moyer Cc: Christoph Hellwig Cc: Dave Chinner Cc: Jan Kara Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Vishal Verma --- Documentation/filesystems/dax.txt | 32 +++++++++++++++++++++++++++++++ fs/dax.c | 30 ++++++++++++++++++++++++----- 2 files changed, 57 insertions(+), 5 deletions(-) diff --git a/Documentation/filesystems/dax.txt b/Documentation/filesystems/dax.txt index 7bde64014a89..ce4587d257d2 100644 --- a/Documentation/filesystems/dax.txt +++ b/Documentation/filesystems/dax.txt @@ -79,6 +79,38 @@ These filesystems may be used for inspiration: - ext4: the fourth extended filesystem, see Documentation/filesystems/ext4.txt +Handling Media Errors +--------------------- + +The libnvdimm subsystem stores a record of known media error locations for +each pmem block device (in gendisk->badblocks). If we fault at such location, +or one with a latent error not yet discovered, the application can expect +to receive a SIGBUS. Libnvdimm also allows clearing of these errors by simply +writing the affected sectors (through the pmem driver, and if the underlying +NVDIMM supports the clear_poison DSM defined by ACPI). + +Since DAX IO normally doesn't go through the driver/bio path, applications or +sysadmins have an option to restore the lost data from a prior backup/inbuilt +redundancy in the following ways: + +1. Delete the affected file, and restore from a backup (sysadmin route): + This will free the file system blocks that were being used by the file, + and the next time they're allocated, they will be zeroed first, which + happens through the driver, and will clear bad sectors. + +2. Truncate or hole-punch the part of the file that has a bad-block (at least + an entire aligned sector has to be hole-punched, but not necessarily an + entire filesystem block). + +These are the two basic paths that allow DAX filesystems to continue operating +in the presence of media errors. More robust error recovery mechanisms can be +built on top of this in the future, for example, involving redundancy/mirroring +provided at the block layer through DM, or additionally, at the filesystem +level. These would have to rely on the above two tenets, that error clearing +can happen either by sending an IO through the driver, or zeroing (also through +the driver). + + Shortcomings ------------ diff --git a/fs/dax.c b/fs/dax.c index 651d4b18ac29..0b9a16934017 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -947,6 +947,19 @@ int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) } EXPORT_SYMBOL_GPL(dax_pfn_mkwrite); +static bool dax_range_is_aligned(struct block_device *bdev, + unsigned int offset, unsigned int length) +{ + unsigned short sector_size = bdev_logical_block_size(bdev); + + if (!IS_ALIGNED(offset, sector_size)) + return false; + if (!IS_ALIGNED(length, sector_size)) + return false; + + return true; +} + int __dax_zero_page_range(struct block_device *bdev, sector_t sector, unsigned int offset, unsigned int length) { @@ -955,11 +968,18 @@ int __dax_zero_page_range(struct block_device *bdev, sector_t sector, .size = PAGE_SIZE, }; - if (dax_map_atomic(bdev, &dax) < 0) - return PTR_ERR(dax.addr); - clear_pmem(dax.addr + offset, length); - wmb_pmem(); - dax_unmap_atomic(bdev, &dax); + if (dax_range_is_aligned(bdev, offset, length)) { + sector_t start_sector = dax.sector + (offset >> 9); + + return blkdev_issue_zeroout(bdev, start_sector, + length >> 9, GFP_NOFS, true); + } else { + if (dax_map_atomic(bdev, &dax) < 0) + return PTR_ERR(dax.addr); + clear_pmem(dax.addr + offset, length); + wmb_pmem(); + dax_unmap_atomic(bdev, &dax); + } return 0; } EXPORT_SYMBOL_GPL(__dax_zero_page_range); From 40543f62cbdce42633e3fe10923099feee272e1f Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Thu, 21 Apr 2016 16:45:56 -0400 Subject: [PATCH 20/20] dax: fix a comment in dax_zero_page_range and dax_truncate_page The distinction between PAGE_SIZE and PAGE_CACHE_SIZE was removed in 09cbfea mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros The comments for the above functions described a distinction between those, that is now redundant, so remove those paragraphs Cc: Kirill A. Shutemov Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Vishal Verma --- fs/dax.c | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index 0b9a16934017..ea936b3d93dc 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -995,12 +995,6 @@ EXPORT_SYMBOL_GPL(__dax_zero_page_range); * page in a DAX file. This is intended for hole-punch operations. If * you are truncating a file, the helper function dax_truncate_page() may be * more convenient. - * - * We work in terms of PAGE_SIZE here for commonality with - * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem - * took care of disposing of the unnecessary blocks. Even if the filesystem - * block size is smaller than PAGE_SIZE, we have to zero the rest of the page - * since the file might be mmapped. */ int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length, get_block_t get_block) @@ -1035,12 +1029,6 @@ EXPORT_SYMBOL_GPL(dax_zero_page_range); * * Similar to block_truncate_page(), this function can be called by a * filesystem when it is truncating a DAX file to handle the partial page. - * - * We work in terms of PAGE_SIZE here for commonality with - * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem - * took care of disposing of the unnecessary blocks. Even if the filesystem - * block size is smaller than PAGE_SIZE, we have to zero the rest of the page - * since the file might be mmapped. */ int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block) {