dax: Call ->iomap_begin without entry lock during dax fault

Currently ->iomap_begin() handler is called with entry lock held. If the
filesystem held any locks between ->iomap_begin() and ->iomap_end()
(such as ext4 which will want to hold transaction open), this would cause
lock inversion with the iomap_apply() from standard IO path which first
calls ->iomap_begin() and only then calls ->actor() callback which grabs
entry locks for DAX (if it faults when copying from/to user provided
buffers).

Fix the problem by nesting grabbing of entry lock inside ->iomap_begin()
- ->iomap_end() pair.

Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
This commit is contained in:
Jan Kara 2016-10-19 14:34:31 +02:00 коммит произвёл Dan Williams
Родитель f449b936f1
Коммит 9f141d6ef6
1 изменённых файлов: 67 добавлений и 56 удалений

123
fs/dax.c
Просмотреть файл

@ -1078,6 +1078,15 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
} }
EXPORT_SYMBOL_GPL(dax_iomap_rw); EXPORT_SYMBOL_GPL(dax_iomap_rw);
static int dax_fault_return(int error)
{
if (error == 0)
return VM_FAULT_NOPAGE;
if (error == -ENOMEM)
return VM_FAULT_OOM;
return VM_FAULT_SIGBUS;
}
/** /**
* dax_iomap_fault - handle a page fault on a DAX file * dax_iomap_fault - handle a page fault on a DAX file
* @vma: The virtual memory area where the fault occurred * @vma: The virtual memory area where the fault occurred
@ -1110,12 +1119,6 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
if (pos >= i_size_read(inode)) if (pos >= i_size_read(inode))
return VM_FAULT_SIGBUS; return VM_FAULT_SIGBUS;
entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
if (IS_ERR(entry)) {
error = PTR_ERR(entry);
goto out;
}
if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page) if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
flags |= IOMAP_WRITE; flags |= IOMAP_WRITE;
@ -1126,9 +1129,15 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
*/ */
error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap); error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap);
if (error) if (error)
goto unlock_entry; return dax_fault_return(error);
if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) { if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
error = -EIO; /* fs corruption? */ vmf_ret = dax_fault_return(-EIO); /* fs corruption? */
goto finish_iomap;
}
entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
if (IS_ERR(entry)) {
vmf_ret = dax_fault_return(PTR_ERR(entry));
goto finish_iomap; goto finish_iomap;
} }
@ -1151,13 +1160,13 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
} }
if (error) if (error)
goto finish_iomap; goto error_unlock_entry;
__SetPageUptodate(vmf->cow_page); __SetPageUptodate(vmf->cow_page);
vmf_ret = finish_fault(vmf); vmf_ret = finish_fault(vmf);
if (!vmf_ret) if (!vmf_ret)
vmf_ret = VM_FAULT_DONE_COW; vmf_ret = VM_FAULT_DONE_COW;
goto finish_iomap; goto unlock_entry;
} }
switch (iomap.type) { switch (iomap.type) {
@ -1169,12 +1178,15 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
} }
error = dax_insert_mapping(mapping, iomap.bdev, sector, error = dax_insert_mapping(mapping, iomap.bdev, sector,
PAGE_SIZE, &entry, vma, vmf); PAGE_SIZE, &entry, vma, vmf);
/* -EBUSY is fine, somebody else faulted on the same PTE */
if (error == -EBUSY)
error = 0;
break; break;
case IOMAP_UNWRITTEN: case IOMAP_UNWRITTEN:
case IOMAP_HOLE: case IOMAP_HOLE:
if (!(vmf->flags & FAULT_FLAG_WRITE)) { if (!(vmf->flags & FAULT_FLAG_WRITE)) {
vmf_ret = dax_load_hole(mapping, &entry, vmf); vmf_ret = dax_load_hole(mapping, &entry, vmf);
goto finish_iomap; goto unlock_entry;
} }
/*FALLTHRU*/ /*FALLTHRU*/
default: default:
@ -1183,30 +1195,25 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
break; break;
} }
finish_iomap: error_unlock_entry:
if (ops->iomap_end) { vmf_ret = dax_fault_return(error) | major;
if (error || (vmf_ret & VM_FAULT_ERROR)) {
/* keep previous error */
ops->iomap_end(inode, pos, PAGE_SIZE, 0, flags,
&iomap);
} else {
error = ops->iomap_end(inode, pos, PAGE_SIZE,
PAGE_SIZE, flags, &iomap);
}
}
unlock_entry: unlock_entry:
put_locked_mapping_entry(mapping, vmf->pgoff, entry); put_locked_mapping_entry(mapping, vmf->pgoff, entry);
out: finish_iomap:
if (error == -ENOMEM) if (ops->iomap_end) {
return VM_FAULT_OOM | major; int copied = PAGE_SIZE;
/* -EBUSY is fine, somebody else faulted on the same PTE */
if (error < 0 && error != -EBUSY) if (vmf_ret & VM_FAULT_ERROR)
return VM_FAULT_SIGBUS | major; copied = 0;
if (vmf_ret) { /*
WARN_ON_ONCE(error); /* -EBUSY from ops->iomap_end? */ * The fault is done by now and there's no way back (other
return vmf_ret; * thread may be already happily using PTE we have installed).
* Just ignore error from ->iomap_end since we cannot do much
* with it.
*/
ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
} }
return VM_FAULT_NOPAGE | major; return vmf_ret;
} }
EXPORT_SYMBOL_GPL(dax_iomap_fault); EXPORT_SYMBOL_GPL(dax_iomap_fault);
@ -1330,16 +1337,6 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
if ((pgoff | PG_PMD_COLOUR) > max_pgoff) if ((pgoff | PG_PMD_COLOUR) > max_pgoff)
goto fallback; goto fallback;
/*
* grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
* PMD or a HZP entry. If it can't (because a 4k page is already in
* the tree, for instance), it will return -EEXIST and we just fall
* back to 4k entries.
*/
entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
if (IS_ERR(entry))
goto fallback;
/* /*
* Note that we don't use iomap_apply here. We aren't doing I/O, only * Note that we don't use iomap_apply here. We aren't doing I/O, only
* setting up a mapping, so really we're using iomap_begin() as a way * setting up a mapping, so really we're using iomap_begin() as a way
@ -1348,10 +1345,21 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
pos = (loff_t)pgoff << PAGE_SHIFT; pos = (loff_t)pgoff << PAGE_SHIFT;
error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap); error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap);
if (error) if (error)
goto unlock_entry; goto fallback;
if (iomap.offset + iomap.length < pos + PMD_SIZE) if (iomap.offset + iomap.length < pos + PMD_SIZE)
goto finish_iomap; goto finish_iomap;
/*
* grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
* PMD or a HZP entry. If it can't (because a 4k page is already in
* the tree, for instance), it will return -EEXIST and we just fall
* back to 4k entries.
*/
entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
if (IS_ERR(entry))
goto finish_iomap;
vmf.pgoff = pgoff; vmf.pgoff = pgoff;
vmf.flags = flags; vmf.flags = flags;
vmf.gfp_mask = mapping_gfp_mask(mapping) | __GFP_IO; vmf.gfp_mask = mapping_gfp_mask(mapping) | __GFP_IO;
@ -1364,7 +1372,7 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
case IOMAP_UNWRITTEN: case IOMAP_UNWRITTEN:
case IOMAP_HOLE: case IOMAP_HOLE:
if (WARN_ON_ONCE(write)) if (WARN_ON_ONCE(write))
goto finish_iomap; goto unlock_entry;
result = dax_pmd_load_hole(vma, pmd, &vmf, address, &iomap, result = dax_pmd_load_hole(vma, pmd, &vmf, address, &iomap,
&entry); &entry);
break; break;
@ -1373,20 +1381,23 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
break; break;
} }
finish_iomap:
if (ops->iomap_end) {
if (result == VM_FAULT_FALLBACK) {
ops->iomap_end(inode, pos, PMD_SIZE, 0, iomap_flags,
&iomap);
} else {
error = ops->iomap_end(inode, pos, PMD_SIZE, PMD_SIZE,
iomap_flags, &iomap);
if (error)
result = VM_FAULT_FALLBACK;
}
}
unlock_entry: unlock_entry:
put_locked_mapping_entry(mapping, pgoff, entry); put_locked_mapping_entry(mapping, pgoff, entry);
finish_iomap:
if (ops->iomap_end) {
int copied = PMD_SIZE;
if (result == VM_FAULT_FALLBACK)
copied = 0;
/*
* The fault is done by now and there's no way back (other
* thread may be already happily using PMD we have installed).
* Just ignore error from ->iomap_end since we cannot do much
* with it.
*/
ops->iomap_end(inode, pos, PMD_SIZE, copied, iomap_flags,
&iomap);
}
fallback: fallback:
if (result == VM_FAULT_FALLBACK) { if (result == VM_FAULT_FALLBACK) {
split_huge_pmd(vma, pmd, address); split_huge_pmd(vma, pmd, address);